2004-02-17 15:41:38 +03:00
|
|
|
/*
|
|
|
|
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
|
2007-07-05 08:29:09 +04:00
|
|
|
* Copyright 2004-2007 James Bursa <bursa@users.sourceforge.net>
|
2007-08-08 20:16:03 +04:00
|
|
|
*
|
|
|
|
* This file is part of NetSurf, http://www.netsurf-browser.org/
|
|
|
|
*
|
|
|
|
* NetSurf is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; version 2 of the License.
|
|
|
|
*
|
|
|
|
* NetSurf is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2004-03-27 21:44:26 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
/** \file
|
|
|
|
* Save HTML document with dependencies (implementation).
|
2004-02-17 15:41:38 +03:00
|
|
|
*/
|
|
|
|
|
2008-07-27 03:42:24 +04:00
|
|
|
#include "utils/config.h"
|
|
|
|
|
2005-05-21 19:59:19 +04:00
|
|
|
#define _GNU_SOURCE /* for strndup */
|
2004-03-24 03:07:21 +03:00
|
|
|
#include <assert.h>
|
2004-03-27 21:44:26 +03:00
|
|
|
#include <ctype.h>
|
2004-06-06 23:39:17 +04:00
|
|
|
#include <errno.h>
|
2007-07-05 08:29:09 +04:00
|
|
|
#include <stdio.h>
|
2004-02-17 15:41:38 +03:00
|
|
|
#include <string.h>
|
2004-03-27 21:44:26 +03:00
|
|
|
#include <sys/types.h>
|
|
|
|
#include <regex.h>
|
2007-05-31 02:39:54 +04:00
|
|
|
#include <libxml/HTMLtree.h>
|
|
|
|
#include <libxml/parserInternals.h>
|
2008-07-27 02:29:15 +04:00
|
|
|
#include "oslib/osfile.h"
|
2007-05-31 02:39:54 +04:00
|
|
|
#include "utils/config.h"
|
|
|
|
#include "content/content.h"
|
|
|
|
#include "css/css.h"
|
|
|
|
#include "render/box.h"
|
|
|
|
#include "riscos/gui.h"
|
|
|
|
#include "riscos/save_complete.h"
|
|
|
|
#include "utils/log.h"
|
|
|
|
#include "utils/url.h"
|
|
|
|
#include "utils/utils.h"
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
/** Compiled regular expression matching a CSS \@import rule. It is compiled
 * by save_complete_init() and used by rewrite_stylesheet_urls(). */
regex_t save_complete_import_re;
|
|
|
|
|
|
|
|
/**
 * Node of the singly linked save_complete_list.
 */
struct save_complete_entry {
	struct content *content;		/**< Content recorded by this node */
	struct save_complete_entry *next;	/**< Following node, 0 at the end */
};

/** Head of the list of contents seen and saved so far (0 when empty). */
static struct save_complete_entry *save_complete_list = NULL;
|
2004-04-06 02:36:48 +04:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
static bool save_complete_html(struct content *c, const char *path,
|
|
|
|
bool index);
|
|
|
|
static bool save_imported_sheets(struct content *c, const char *path);
|
2004-04-06 02:36:48 +04:00
|
|
|
static char * rewrite_stylesheet_urls(const char *source, unsigned int size,
|
2004-03-27 21:44:26 +03:00
|
|
|
int *osize, const char *base);
|
2004-06-06 23:39:17 +04:00
|
|
|
static bool rewrite_document_urls(xmlDoc *doc, const char *base);
|
|
|
|
static bool rewrite_urls(xmlNode *n, const char *base);
|
|
|
|
static bool rewrite_url(xmlNode *n, const char *attr, const char *base);
|
|
|
|
static bool save_complete_list_add(struct content *content);
|
|
|
|
static struct content * save_complete_list_find(const char *url);
|
|
|
|
static bool save_complete_list_check(struct content *content);
|
2008-07-27 02:29:15 +04:00
|
|
|
/* static void save_complete_list_dump(void); */
|
2007-07-05 08:29:09 +04:00
|
|
|
static bool save_complete_inventory(const char *path);
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
/**
|
|
|
|
* Save an HTML page with all dependencies.
|
|
|
|
*
|
|
|
|
* \param c CONTENT_HTML to save
|
|
|
|
* \param path directory to save to (must exist)
|
2004-06-06 23:39:17 +04:00
|
|
|
* \return true on success, false on error and error reported
|
2004-03-27 21:44:26 +03:00
|
|
|
*/
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
bool save_complete(struct content *c, const char *path)
|
2004-04-06 01:54:22 +04:00
|
|
|
{
|
2004-06-06 23:39:17 +04:00
|
|
|
bool result;
|
2004-04-13 17:31:54 +04:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
result = save_complete_html(c, path, true);
|
2004-04-06 02:36:48 +04:00
|
|
|
|
2007-07-05 08:29:09 +04:00
|
|
|
if (result)
|
|
|
|
result = save_complete_inventory(path);
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
/* free save_complete_list */
|
2004-04-06 02:36:48 +04:00
|
|
|
while (save_complete_list) {
|
|
|
|
struct save_complete_entry *next = save_complete_list->next;
|
|
|
|
free(save_complete_list);
|
|
|
|
save_complete_list = next;
|
|
|
|
}
|
2004-06-06 23:39:17 +04:00
|
|
|
|
|
|
|
return result;
|
2004-04-06 01:54:22 +04:00
|
|
|
}
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
|
2004-04-06 01:54:22 +04:00
|
|
|
/**
|
|
|
|
* Save an HTML page with all dependencies, recursing through imported pages.
|
|
|
|
*
|
2004-06-06 23:39:17 +04:00
|
|
|
* \param c CONTENT_HTML to save
|
|
|
|
* \param path directory to save to (must exist)
|
|
|
|
* \param index true to save as "index"
|
|
|
|
* \return true on success, false on error and error reported
|
2004-04-06 01:54:22 +04:00
|
|
|
*/
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
bool save_complete_html(struct content *c, const char *path, bool index)
|
2004-03-27 21:44:26 +03:00
|
|
|
{
|
|
|
|
char spath[256];
|
2004-02-17 15:41:38 +03:00
|
|
|
unsigned int i;
|
2009-03-11 20:22:46 +03:00
|
|
|
xmlDocPtr doc;
|
2004-06-06 23:39:17 +04:00
|
|
|
os_error *error;
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2004-03-11 05:19:14 +03:00
|
|
|
if (c->type != CONTENT_HTML)
|
2004-06-06 23:39:17 +04:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (save_complete_list_check(c))
|
|
|
|
return true;
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2004-07-31 03:40:01 +04:00
|
|
|
/* save stylesheets, ignoring the base and adblocking sheets */
|
2009-07-24 03:05:34 +04:00
|
|
|
for (i = STYLESHEET_START; i != c->data.html.stylesheet_count; i++) {
|
2004-03-24 03:07:21 +03:00
|
|
|
struct content *css = c->data.html.stylesheet_content[i];
|
2004-03-27 21:44:26 +03:00
|
|
|
char *source;
|
|
|
|
int source_len;
|
2009-07-24 03:05:34 +04:00
|
|
|
bool is_style;
|
2004-03-11 05:19:14 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!css)
|
|
|
|
continue;
|
|
|
|
if (save_complete_list_check(css))
|
|
|
|
continue;
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2009-07-24 03:05:34 +04:00
|
|
|
is_style = (strcmp(css->url, c->data.html.base_url) == 0);
|
|
|
|
|
|
|
|
if (is_style == false) {
|
2007-07-05 08:29:09 +04:00
|
|
|
if (!save_complete_list_add(css)) {
|
|
|
|
warn_user("NoMemory", 0);
|
|
|
|
return false;
|
|
|
|
}
|
2004-06-06 23:39:17 +04:00
|
|
|
}
|
2004-04-06 01:54:22 +04:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!save_imported_sheets(css, path))
|
|
|
|
return false;
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2009-07-24 03:05:34 +04:00
|
|
|
if (is_style)
|
2004-06-06 23:39:17 +04:00
|
|
|
continue; /* don't save <style> elements */
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
snprintf(spath, sizeof spath, "%s.%x", path,
|
|
|
|
(unsigned int) css);
|
2004-04-06 02:36:48 +04:00
|
|
|
source = rewrite_stylesheet_urls(css->source_data,
|
2004-03-27 21:44:26 +03:00
|
|
|
css->source_size, &source_len, css->url);
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!source) {
|
|
|
|
warn_user("NoMemory", 0);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-03-27 05:07:17 +03:00
|
|
|
error = xosfile_save_stamped(spath, 0xf79,
|
|
|
|
(byte *) source, (byte *) source + source_len);
|
2004-06-06 23:39:17 +04:00
|
|
|
free(source);
|
|
|
|
if (error) {
|
|
|
|
LOG(("xosfile_save_stamped: 0x%x: %s",
|
|
|
|
error->errnum, error->errmess));
|
|
|
|
warn_user("SaveError", error->errmess);
|
|
|
|
return false;
|
2004-03-24 03:07:21 +03:00
|
|
|
}
|
2004-06-06 23:39:17 +04:00
|
|
|
}
|
2004-02-17 15:41:38 +03:00
|
|
|
|
|
|
|
/* save objects */
|
2004-03-11 05:19:14 +03:00
|
|
|
for (i = 0; i != c->data.html.object_count; i++) {
|
|
|
|
struct content *obj = c->data.html.object[i].content;
|
2004-02-17 15:41:38 +03:00
|
|
|
|
|
|
|
/* skip difficult content types */
|
2004-03-27 21:44:26 +03:00
|
|
|
if (!obj || obj->type >= CONTENT_OTHER || !obj->source_data)
|
2004-02-17 15:41:38 +03:00
|
|
|
continue;
|
2004-06-06 23:39:17 +04:00
|
|
|
if (save_complete_list_check(obj))
|
|
|
|
continue;
|
2004-02-17 15:41:38 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!save_complete_list_add(obj)) {
|
|
|
|
warn_user("NoMemory", 0);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (obj->type == CONTENT_HTML) {
|
|
|
|
if (!save_complete_html(obj, path, false))
|
|
|
|
return false;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(spath, sizeof spath, "%s.%x", path,
|
|
|
|
(unsigned int) obj);
|
|
|
|
error = xosfile_save_stamped(spath,
|
|
|
|
ro_content_filetype(obj),
|
2009-03-27 05:07:17 +03:00
|
|
|
(byte *) obj->source_data,
|
|
|
|
(byte *) obj->source_data + obj->source_size);
|
2004-06-06 23:39:17 +04:00
|
|
|
if (error) {
|
|
|
|
LOG(("xosfile_save_stamped: 0x%x: %s",
|
|
|
|
error->errnum, error->errmess));
|
|
|
|
warn_user("SaveError", error->errmess);
|
|
|
|
return false;
|
2004-04-06 01:54:22 +04:00
|
|
|
}
|
2004-02-17 15:41:38 +03:00
|
|
|
}
|
|
|
|
|
2005-05-22 00:29:43 +04:00
|
|
|
/*save_complete_list_dump();*/
|
|
|
|
|
2009-03-11 20:22:46 +03:00
|
|
|
/* copy document */
|
|
|
|
doc = xmlCopyDoc(c->data.html.document, 1);
|
|
|
|
if (doc == NULL) {
|
2004-06-06 23:39:17 +04:00
|
|
|
warn_user("NoMemory", 0);
|
|
|
|
return false;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
|
|
|
/* rewrite all urls we know about */
|
2009-03-11 20:22:46 +03:00
|
|
|
if (!rewrite_document_urls(doc, c->data.html.base_url)) {
|
|
|
|
xmlFreeDoc(doc);
|
2004-06-06 23:39:17 +04:00
|
|
|
warn_user("NoMemory", 0);
|
|
|
|
return false;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
|
|
|
/* save the html file out last of all */
|
2004-04-06 01:54:22 +04:00
|
|
|
if (index)
|
2004-06-06 23:39:17 +04:00
|
|
|
snprintf(spath, sizeof spath, "%s.index", path);
|
|
|
|
else
|
|
|
|
snprintf(spath, sizeof spath, "%s.%x", path, (unsigned int)c);
|
|
|
|
|
|
|
|
errno = 0;
|
2009-03-11 20:22:46 +03:00
|
|
|
if (htmlSaveFileFormat(spath, doc, 0, 0) == -1) {
|
2004-06-06 23:39:17 +04:00
|
|
|
if (errno)
|
|
|
|
warn_user("SaveError", strerror(errno));
|
|
|
|
else
|
|
|
|
warn_user("SaveError", "htmlSaveFileFormat failed");
|
2009-03-11 20:22:46 +03:00
|
|
|
|
|
|
|
xmlFreeDoc(doc);
|
2004-06-06 23:39:17 +04:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-03-11 20:22:46 +03:00
|
|
|
xmlFreeDoc(doc);
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
error = xosfile_set_type(spath, 0xfaf);
|
|
|
|
if (error) {
|
|
|
|
LOG(("xosfile_set_type: 0x%x: %s",
|
|
|
|
error->errnum, error->errmess));
|
|
|
|
warn_user("SaveError", error->errmess);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2004-02-17 15:41:38 +03:00
|
|
|
}
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
|
2004-04-06 01:54:22 +04:00
|
|
|
/**
|
2004-06-06 23:39:17 +04:00
|
|
|
* Save stylesheets imported by a CONTENT_CSS.
|
2004-04-06 01:54:22 +04:00
|
|
|
*
|
2004-06-06 23:39:17 +04:00
|
|
|
* \param c a CONTENT_CSS
|
|
|
|
* \param path path to save to
|
|
|
|
* \return true on success, false on error and error reported
|
2004-04-06 01:54:22 +04:00
|
|
|
*/
|
2004-06-06 23:39:17 +04:00
|
|
|
|
|
|
|
bool save_imported_sheets(struct content *c, const char *path)
|
2004-03-11 05:19:14 +03:00
|
|
|
{
|
2004-03-27 21:44:26 +03:00
|
|
|
char spath[256];
|
2004-06-06 23:39:17 +04:00
|
|
|
unsigned int j;
|
|
|
|
char *source;
|
|
|
|
int source_len;
|
|
|
|
os_error *error;
|
2004-02-18 02:38:44 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
for (j = 0; j != c->data.css.import_count; j++) {
|
2009-07-24 03:05:34 +04:00
|
|
|
struct content *css = c->data.css.imports[j];
|
2004-03-11 05:19:14 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!css)
|
|
|
|
continue;
|
|
|
|
if (save_complete_list_check(css))
|
|
|
|
continue;
|
2004-03-11 05:19:14 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!save_complete_list_add(css)) {
|
|
|
|
warn_user("NoMemory", 0);
|
|
|
|
return false;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!save_imported_sheets(css, path))
|
|
|
|
return false;
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
snprintf(spath, sizeof spath, "%s.%x", path,
|
|
|
|
(unsigned int) css);
|
2004-04-06 02:36:48 +04:00
|
|
|
source = rewrite_stylesheet_urls(css->source_data,
|
2004-03-27 21:44:26 +03:00
|
|
|
css->source_size, &source_len, css->url);
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!source) {
|
|
|
|
warn_user("NoMemory", 0);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-03-27 05:07:17 +03:00
|
|
|
error = xosfile_save_stamped(spath, 0xf79,
|
|
|
|
(byte *) source, (byte *) source + source_len);
|
2004-06-06 23:39:17 +04:00
|
|
|
free(source);
|
|
|
|
if (error) {
|
|
|
|
LOG(("xosfile_save_stamped: 0x%x: %s",
|
|
|
|
error->errnum, error->errmess));
|
|
|
|
warn_user("SaveError", error->errmess);
|
|
|
|
return false;
|
2004-03-27 21:44:26 +03:00
|
|
|
}
|
2004-06-06 23:39:17 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2004-03-24 03:07:21 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
/**
|
|
|
|
* Initialise the save_complete module.
|
|
|
|
*/
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
void save_complete_init(void)
|
|
|
|
{
|
|
|
|
/* Match an @import rule - see CSS 2.1 G.1. */
|
|
|
|
regcomp_wrapper(&save_complete_import_re,
|
|
|
|
"@import" /* IMPORT_SYM */
|
|
|
|
"[ \t\r\n\f]*" /* S* */
|
|
|
|
/* 1 */
|
|
|
|
"(" /* [ */
|
|
|
|
/* 2 3 */
|
|
|
|
"\"(([^\"]|[\\]\")*)\"" /* STRING (approximated) */
|
|
|
|
"|"
|
|
|
|
/* 4 5 */
|
|
|
|
"'(([^']|[\\]')*)'"
|
|
|
|
"|" /* | */
|
|
|
|
"url\\([ \t\r\n\f]*" /* URI (approximated) */
|
|
|
|
/* 6 7 */
|
|
|
|
"\"(([^\"]|[\\]\")*)\""
|
|
|
|
"[ \t\r\n\f]*\\)"
|
|
|
|
"|"
|
|
|
|
"url\\([ \t\r\n\f]*"
|
|
|
|
/* 8 9 */
|
|
|
|
"'(([^']|[\\]')*)'"
|
|
|
|
"[ \t\r\n\f]*\\)"
|
|
|
|
"|"
|
|
|
|
"url\\([ \t\r\n\f]*"
|
|
|
|
/* 10 */
|
|
|
|
"([^) \t\r\n\f]*)"
|
|
|
|
"[ \t\r\n\f]*\\)"
|
|
|
|
")", /* ] */
|
|
|
|
REG_EXTENDED | REG_ICASE);
|
2004-03-24 03:07:21 +03:00
|
|
|
}
|
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
|
2004-03-24 03:07:21 +03:00
|
|
|
/**
|
2004-05-02 00:16:38 +04:00
|
|
|
* Rewrite stylesheet \@import rules for save complete.
|
2004-03-24 03:07:21 +03:00
|
|
|
*
|
2004-03-27 21:44:26 +03:00
|
|
|
* @param source stylesheet source
|
|
|
|
* @param size size of source
|
|
|
|
* @param osize updated with the size of the result
|
|
|
|
* @param base url of stylesheet
|
2004-06-06 23:39:17 +04:00
|
|
|
* @return converted source, or 0 on out of memory
|
2004-03-24 03:07:21 +03:00
|
|
|
*/
|
|
|
|
|
2004-04-06 02:36:48 +04:00
|
|
|
char * rewrite_stylesheet_urls(const char *source, unsigned int size,
|
2004-03-27 21:44:26 +03:00
|
|
|
int *osize, const char *base)
|
|
|
|
{
|
|
|
|
char *res;
|
|
|
|
const char *url;
|
|
|
|
char *url2;
|
|
|
|
char buf[20];
|
|
|
|
unsigned int offset = 0;
|
|
|
|
int url_len = 0;
|
2004-06-06 23:39:17 +04:00
|
|
|
struct content *content;
|
2004-03-27 21:44:26 +03:00
|
|
|
int m;
|
|
|
|
unsigned int i;
|
|
|
|
unsigned int imports = 0;
|
|
|
|
regmatch_t match[11];
|
2004-08-09 20:11:58 +04:00
|
|
|
url_func_result result;
|
2004-03-27 21:44:26 +03:00
|
|
|
|
|
|
|
/* count number occurences of @import to (over)estimate result size */
|
|
|
|
/* can't use strstr because source is not 0-terminated string */
|
|
|
|
for (i = 0; 7 < size && i != size - 7; i++) {
|
|
|
|
if (source[i] == '@' &&
|
|
|
|
tolower(source[i + 1]) == 'i' &&
|
|
|
|
tolower(source[i + 2]) == 'm' &&
|
|
|
|
tolower(source[i + 3]) == 'p' &&
|
|
|
|
tolower(source[i + 4]) == 'o' &&
|
|
|
|
tolower(source[i + 5]) == 'r' &&
|
|
|
|
tolower(source[i + 6]) == 't')
|
|
|
|
imports++;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
res = malloc(size + imports * 20);
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!res)
|
2004-03-27 21:44:26 +03:00
|
|
|
return 0;
|
|
|
|
*osize = 0;
|
|
|
|
|
|
|
|
while (offset < size) {
|
|
|
|
m = regexec(&save_complete_import_re, source + offset,
|
|
|
|
11, match, 0);
|
|
|
|
if (m)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/*for (unsigned int i = 0; i != 11; i++) {
|
|
|
|
if (match[i].rm_so == -1)
|
|
|
|
continue;
|
|
|
|
fprintf(stderr, "%i: '%.*s'\n", i,
|
|
|
|
match[i].rm_eo - match[i].rm_so,
|
|
|
|
source + offset + match[i].rm_so);
|
|
|
|
}*/
|
|
|
|
|
|
|
|
url = 0;
|
|
|
|
if (match[2].rm_so != -1) {
|
|
|
|
url = source + offset + match[2].rm_so;
|
|
|
|
url_len = match[2].rm_eo - match[2].rm_so;
|
|
|
|
} else if (match[4].rm_so != -1) {
|
|
|
|
url = source + offset + match[4].rm_so;
|
|
|
|
url_len = match[4].rm_eo - match[4].rm_so;
|
|
|
|
} else if (match[6].rm_so != -1) {
|
|
|
|
url = source + offset + match[6].rm_so;
|
|
|
|
url_len = match[6].rm_eo - match[6].rm_so;
|
|
|
|
} else if (match[8].rm_so != -1) {
|
|
|
|
url = source + offset + match[8].rm_so;
|
|
|
|
url_len = match[8].rm_eo - match[8].rm_so;
|
|
|
|
} else if (match[10].rm_so != -1) {
|
|
|
|
url = source + offset + match[10].rm_so;
|
|
|
|
url_len = match[10].rm_eo - match[10].rm_so;
|
|
|
|
}
|
|
|
|
assert(url);
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
url2 = strndup(url, url_len);
|
|
|
|
if (!url2) {
|
|
|
|
free(res);
|
|
|
|
return 0;
|
|
|
|
}
|
2004-08-09 20:11:58 +04:00
|
|
|
result = url_join(url2, base, (char**)&url);
|
2004-03-27 21:44:26 +03:00
|
|
|
free(url2);
|
2004-08-09 20:11:58 +04:00
|
|
|
if (result == URL_FUNC_NOMEM) {
|
2004-03-27 21:44:26 +03:00
|
|
|
free(res);
|
|
|
|
return 0;
|
2004-06-06 23:39:17 +04:00
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
/* copy data before match */
|
|
|
|
memcpy(res + *osize, source + offset, match[0].rm_so);
|
|
|
|
*osize += match[0].rm_so;
|
|
|
|
|
2004-08-09 20:11:58 +04:00
|
|
|
if (result == URL_FUNC_OK) {
|
|
|
|
content = save_complete_list_find(url);
|
|
|
|
if (content) {
|
|
|
|
/* replace import */
|
|
|
|
snprintf(buf, sizeof buf, "@import '%x'",
|
|
|
|
(unsigned int) content);
|
|
|
|
memcpy(res + *osize, buf, strlen(buf));
|
|
|
|
*osize += strlen(buf);
|
|
|
|
} else {
|
|
|
|
/* copy import */
|
|
|
|
memcpy(res + *osize, source + offset + match[0].rm_so,
|
|
|
|
match[0].rm_eo - match[0].rm_so);
|
|
|
|
*osize += match[0].rm_eo - match[0].rm_so;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
2004-03-27 21:44:26 +03:00
|
|
|
/* copy import */
|
|
|
|
memcpy(res + *osize, source + offset + match[0].rm_so,
|
2004-08-09 20:11:58 +04:00
|
|
|
match[0].rm_eo - match[0].rm_so);
|
2004-03-27 21:44:26 +03:00
|
|
|
*osize += match[0].rm_eo - match[0].rm_so;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
assert(0 < match[0].rm_eo);
|
|
|
|
offset += match[0].rm_eo;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
/* copy rest of source */
|
|
|
|
if (offset < size) {
|
|
|
|
memcpy(res + *osize, source + offset, size - offset);
|
|
|
|
*osize += size - offset;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
return res;
|
2004-03-24 03:07:21 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2004-06-06 23:39:17 +04:00
|
|
|
* Rewrite URLs in a HTML document to be relative.
|
2004-03-24 03:07:21 +03:00
|
|
|
*
|
2004-06-06 23:39:17 +04:00
|
|
|
* \param doc root of the document tree
|
2004-03-27 21:44:26 +03:00
|
|
|
* \param base base url of document
|
2004-06-06 23:39:17 +04:00
|
|
|
* \return true on success, false on out of memory
|
2004-03-24 03:07:21 +03:00
|
|
|
*/
|
2004-03-27 21:44:26 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
bool rewrite_document_urls(xmlDoc *doc, const char *base)
|
2004-03-24 03:07:21 +03:00
|
|
|
{
|
2004-06-06 23:39:17 +04:00
|
|
|
xmlNode *node;
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
for (node = doc->children; node; node = node->next)
|
|
|
|
if (node->type == XML_ELEMENT_NODE)
|
|
|
|
if (!rewrite_urls(node, base))
|
|
|
|
return false;
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
return true;
|
2004-02-18 02:38:44 +03:00
|
|
|
}
|
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
|
2004-03-24 03:07:21 +03:00
|
|
|
/**
|
|
|
|
* Traverse tree, rewriting URLs as we go.
|
|
|
|
*
|
2004-06-06 23:39:17 +04:00
|
|
|
* \param n xmlNode of type XML_ELEMENT_NODE to rewrite
|
2004-04-06 01:54:22 +04:00
|
|
|
* \param base base url of document
|
2004-06-06 23:39:17 +04:00
|
|
|
* \return true on success, false on out of memory
|
|
|
|
*
|
|
|
|
* URLs in the tree rooted at element n are rewritten.
|
2004-03-24 03:07:21 +03:00
|
|
|
*/
|
2004-03-27 21:44:26 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
bool rewrite_urls(xmlNode *n, const char *base)
|
2004-03-24 03:07:21 +03:00
|
|
|
{
|
2004-06-06 23:39:17 +04:00
|
|
|
xmlNode *child;
|
|
|
|
|
2005-05-22 00:29:43 +04:00
|
|
|
assert(n->type == XML_ELEMENT_NODE);
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
/**
|
|
|
|
* We only need to consider the following cases:
|
|
|
|
*
|
|
|
|
* Attribute: Elements:
|
|
|
|
*
|
|
|
|
* 1) data <object>
|
2005-05-22 00:29:43 +04:00
|
|
|
* 2) href <a> <area> <link>
|
2004-06-06 23:39:17 +04:00
|
|
|
* 3) src <script> <input> <frame> <iframe> <img>
|
2005-05-22 00:29:43 +04:00
|
|
|
* 4) n/a <style>
|
|
|
|
* 5) n/a any <base> tag
|
|
|
|
* 6) background any (except those above)
|
2004-06-06 23:39:17 +04:00
|
|
|
*/
|
2004-06-29 16:38:49 +04:00
|
|
|
if (!n->name) {
|
|
|
|
/* ignore */
|
|
|
|
}
|
2004-06-06 23:39:17 +04:00
|
|
|
/* 1 */
|
2009-03-27 05:07:17 +03:00
|
|
|
else if (strcmp((const char *) n->name, "object") == 0) {
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!rewrite_url(n, "data", base))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
/* 2 */
|
2009-03-27 05:07:17 +03:00
|
|
|
else if (strcmp((const char *) n->name, "a") == 0 ||
|
|
|
|
strcmp((const char *) n->name, "area") == 0 ||
|
|
|
|
strcmp((const char *) n->name, "link") == 0) {
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!rewrite_url(n, "href", base))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
/* 3 */
|
2009-03-27 05:07:17 +03:00
|
|
|
else if (strcmp((const char *) n->name, "frame") == 0 ||
|
|
|
|
strcmp((const char *) n->name, "iframe") == 0 ||
|
|
|
|
strcmp((const char *) n->name, "input") == 0 ||
|
|
|
|
strcmp((const char *) n->name, "img") == 0 ||
|
|
|
|
strcmp((const char *) n->name, "script") == 0) {
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!rewrite_url(n, "src", base))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
/* 4 */
|
2009-03-27 05:07:17 +03:00
|
|
|
else if (strcmp((const char *) n->name, "style") == 0) {
|
2004-06-06 23:39:17 +04:00
|
|
|
unsigned int len;
|
|
|
|
xmlChar *content;
|
|
|
|
|
|
|
|
for (child = n->children; child != 0; child = child->next) {
|
|
|
|
/* Get current content */
|
|
|
|
content = xmlNodeGetContent(child);
|
|
|
|
if (!content)
|
2005-05-22 00:29:43 +04:00
|
|
|
/* unfortunately we don't know if this is
|
|
|
|
* due to memory exhaustion, or because
|
|
|
|
* there is no content for this node */
|
2004-06-06 23:39:17 +04:00
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Rewrite @import rules */
|
|
|
|
char *rewritten = rewrite_stylesheet_urls(
|
2009-03-27 05:07:17 +03:00
|
|
|
(const char *) content,
|
|
|
|
strlen((const char *) content),
|
|
|
|
(int *) &len, base);
|
2004-06-06 23:39:17 +04:00
|
|
|
xmlFree(content);
|
|
|
|
if (!rewritten)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* set new content */
|
|
|
|
xmlNodeSetContentLen(child,
|
|
|
|
(const xmlChar*)rewritten,
|
|
|
|
len);
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
2004-03-24 03:07:21 +03:00
|
|
|
}
|
2004-06-10 03:21:24 +04:00
|
|
|
/* 5 */
|
2009-03-27 05:07:17 +03:00
|
|
|
else if (strcmp((const char *) n->name, "base") == 0) {
|
2005-05-22 00:29:43 +04:00
|
|
|
/* simply remove any <base> tags from the document */
|
|
|
|
xmlUnlinkNode(n);
|
|
|
|
xmlFreeNode(n);
|
|
|
|
/* base tags have no content, so there's no point recursing
|
|
|
|
* additionally, we've just destroyed this node, so trying
|
|
|
|
* to recurse would result in bad things happening */
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
/* 6 */
|
2004-06-10 03:21:24 +04:00
|
|
|
else {
|
|
|
|
if (!rewrite_url(n, "background", base))
|
|
|
|
return false;
|
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
/* now recurse */
|
2005-05-22 00:29:43 +04:00
|
|
|
for (child = n->children; child;) {
|
|
|
|
/* we must extract the next child now, as if the current
|
|
|
|
* child is a <base> element, it will be removed from the
|
|
|
|
* tree (see 5, above), thus preventing extraction of the
|
|
|
|
* next child */
|
|
|
|
xmlNode *next = child->next;
|
|
|
|
if (child->type == XML_ELEMENT_NODE) {
|
|
|
|
if (!rewrite_urls(child, base))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
child = next;
|
|
|
|
}
|
2004-06-06 23:39:17 +04:00
|
|
|
|
|
|
|
return true;
|
2004-03-24 03:07:21 +03:00
|
|
|
}
|
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
|
2004-03-24 03:44:52 +03:00
|
|
|
/**
|
|
|
|
* Rewrite an URL in a HTML document.
|
|
|
|
*
|
2004-03-27 21:44:26 +03:00
|
|
|
* \param n The node to modify
|
|
|
|
* \param attr The html attribute to modify
|
|
|
|
* \param base base url of document
|
2004-06-06 23:39:17 +04:00
|
|
|
* \return true on success, false on out of memory
|
2004-03-24 03:44:52 +03:00
|
|
|
*/
|
2004-03-27 21:44:26 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
bool rewrite_url(xmlNode *n, const char *attr, const char *base)
|
2004-03-24 03:07:21 +03:00
|
|
|
{
|
2004-06-06 23:39:17 +04:00
|
|
|
char *url, *data;
|
|
|
|
char rel[20];
|
|
|
|
struct content *content;
|
2004-08-09 20:11:58 +04:00
|
|
|
url_func_result res;
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!xmlHasProp(n, (const xmlChar *) attr))
|
|
|
|
return true;
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2009-03-27 05:07:17 +03:00
|
|
|
data = (char *) xmlGetProp(n, (const xmlChar *) attr);
|
2004-06-06 23:39:17 +04:00
|
|
|
if (!data)
|
|
|
|
return false;
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-08-09 20:11:58 +04:00
|
|
|
res = url_join(data, base, &url);
|
2004-06-06 23:39:17 +04:00
|
|
|
xmlFree(data);
|
2004-08-09 20:11:58 +04:00
|
|
|
if (res == URL_FUNC_NOMEM)
|
2004-06-06 23:39:17 +04:00
|
|
|
return false;
|
2004-08-09 20:11:58 +04:00
|
|
|
else if (res == URL_FUNC_OK) {
|
|
|
|
content = save_complete_list_find(url);
|
|
|
|
if (content) {
|
|
|
|
/* found a match */
|
|
|
|
free(url);
|
|
|
|
snprintf(rel, sizeof rel, "%x",
|
|
|
|
(unsigned int) content);
|
|
|
|
if (!xmlSetProp(n, (const xmlChar *) attr,
|
|
|
|
(xmlChar *) rel))
|
|
|
|
return false;
|
|
|
|
} else {
|
|
|
|
/* no match found */
|
|
|
|
if (!xmlSetProp(n, (const xmlChar *) attr,
|
|
|
|
(xmlChar *) url)) {
|
|
|
|
free(url);
|
|
|
|
return false;
|
|
|
|
}
|
2004-06-06 23:39:17 +04:00
|
|
|
free(url);
|
|
|
|
}
|
2004-03-27 21:44:26 +03:00
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
return true;
|
2004-03-27 21:44:26 +03:00
|
|
|
}
|
2004-03-24 03:07:21 +03:00
|
|
|
|
|
|
|
|
2004-03-27 21:44:26 +03:00
|
|
|
/**
|
2004-06-06 23:39:17 +04:00
|
|
|
* Add a content to the save_complete_list.
|
2004-03-27 21:44:26 +03:00
|
|
|
*
|
2004-06-06 23:39:17 +04:00
|
|
|
* \param content content to add
|
|
|
|
* \return true on success, false on out of memory
|
2004-03-27 21:44:26 +03:00
|
|
|
*/
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
bool save_complete_list_add(struct content *content)
|
2004-03-27 21:44:26 +03:00
|
|
|
{
|
|
|
|
struct save_complete_entry *entry;
|
|
|
|
entry = malloc(sizeof (*entry));
|
|
|
|
if (!entry)
|
2004-06-06 23:39:17 +04:00
|
|
|
return false;
|
|
|
|
entry->content = content;
|
2004-04-06 02:36:48 +04:00
|
|
|
entry->next = save_complete_list;
|
|
|
|
save_complete_list = entry;
|
2004-06-06 23:39:17 +04:00
|
|
|
return true;
|
2004-03-27 21:44:26 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Look up a url in the save_complete_list.
|
|
|
|
*
|
2004-06-06 23:39:17 +04:00
|
|
|
* \param url url to find
|
|
|
|
* \return content if found, 0 otherwise
|
2004-03-27 21:44:26 +03:00
|
|
|
*/
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
struct content * save_complete_list_find(const char *url)
|
2004-03-27 21:44:26 +03:00
|
|
|
{
|
|
|
|
struct save_complete_entry *entry;
|
|
|
|
for (entry = save_complete_list; entry; entry = entry->next)
|
2004-06-06 23:39:17 +04:00
|
|
|
if (strcmp(url, entry->content->url) == 0)
|
|
|
|
return entry->content;
|
2004-03-27 21:44:26 +03:00
|
|
|
return 0;
|
2004-03-24 03:07:21 +03:00
|
|
|
}
|
|
|
|
|
2004-06-06 23:39:17 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Look up a content in the save_complete_list.
|
|
|
|
*
|
|
|
|
* \param content pointer to content
|
|
|
|
* \return true if the content is in the save_complete_list
|
|
|
|
*/
|
|
|
|
|
|
|
|
bool save_complete_list_check(struct content *content)
|
|
|
|
{
|
|
|
|
struct save_complete_entry *entry;
|
|
|
|
for (entry = save_complete_list; entry; entry = entry->next)
|
|
|
|
if (entry->content == content)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2008-07-27 02:29:15 +04:00
|
|
|
|
|
|
|
#if 0
/**
 * Dump save complete list to stderr
 *
 * One line is written per entry: the content pointer, then its url.
 */
void save_complete_list_dump(void)
{
	struct save_complete_entry *cur = save_complete_list;

	while (cur) {
		fprintf(stderr, "%p : %s\n", cur->content,
				cur->content->url);
		cur = cur->next;
	}
}
#endif
|
2005-05-22 00:29:43 +04:00
|
|
|
|
2007-07-05 08:29:09 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Create the inventory file listing original URLs.
|
|
|
|
*/
|
|
|
|
|
|
|
|
bool save_complete_inventory(const char *path)
|
|
|
|
{
|
|
|
|
char spath[256];
|
|
|
|
FILE *fp;
|
|
|
|
|
|
|
|
snprintf(spath, sizeof spath, "%s.Inventory", path);
|
|
|
|
|
|
|
|
fp = fopen(spath, "w");
|
|
|
|
if (!fp) {
|
|
|
|
LOG(("fopen(): errno = %i", errno));
|
|
|
|
warn_user("SaveError", strerror(errno));
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct save_complete_entry *entry;
|
|
|
|
for (entry = save_complete_list; entry; entry = entry->next)
|
|
|
|
fprintf(fp, "%x %s\n",
|
|
|
|
(unsigned int) entry->content,
|
|
|
|
entry->content->url);
|
|
|
|
|
|
|
|
fclose(fp);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|