mirror of
https://github.com/netsurf-browser/netsurf
synced 2025-02-16 22:43:58 +03:00
Correctly emit entities when serialising to HTML.
This commit is contained in:
parent
f12be4ed85
commit
ea3f09a724
@ -39,8 +39,10 @@
|
||||
#include "desktop/save_complete.h"
|
||||
#include "render/box.h"
|
||||
#include "render/html.h"
|
||||
#include "utils/corestrings.h"
|
||||
#include "utils/log.h"
|
||||
#include "utils/nsurl.h"
|
||||
#include "utils/utf8.h"
|
||||
#include "utils/utils.h"
|
||||
|
||||
regex_t save_complete_import_re;
|
||||
@ -551,7 +553,9 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
|
||||
{
|
||||
nsurl *url;
|
||||
hlcache_handle *content;
|
||||
char *escaped;
|
||||
nserror error;
|
||||
utf8_convert_ret ret;
|
||||
|
||||
error = nsurl_join(ctx->base, value, &url);
|
||||
if (error == NSERROR_NOMEM)
|
||||
@ -566,11 +570,25 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
|
||||
fprintf(ctx->fp, "\"%p\"", content);
|
||||
} else {
|
||||
/* no match found */
|
||||
fprintf(ctx->fp, "\"%s\"", nsurl_access(url));
|
||||
ret = utf8_to_html(nsurl_access(url), "UTF-8",
|
||||
nsurl_length(url), &escaped);
|
||||
nsurl_unref(url);
|
||||
|
||||
if (ret != UTF8_CONVERT_OK)
|
||||
return false;
|
||||
|
||||
fprintf(ctx->fp, "\"%s\"", escaped);
|
||||
|
||||
free(escaped);
|
||||
}
|
||||
} else {
|
||||
fprintf(ctx->fp, "\"%.*s\"", (int) value_len, value);
|
||||
ret = utf8_to_html(value, "UTF-8", value_len, &escaped);
|
||||
if (ret != UTF8_CONVERT_OK)
|
||||
return false;
|
||||
|
||||
fprintf(ctx->fp, "\"%s\"", escaped);
|
||||
|
||||
free(escaped);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -579,7 +597,16 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
|
||||
/**
 * Write an attribute value to the output file, wrapped in double quotes,
 * with HTML escaping applied.
 *
 * The value is passed through utf8_to_html() with a "UTF-8" target, so it
 * is emitted exactly once, in escaped form (the raw, unescaped write that
 * preceded the escaped one has been dropped).
 *
 * NOTE(review): utf8_to_html() only forces escapes for '&', '<' and '>';
 * a literal '"' inside \a value appears to pass through unchanged, which
 * would terminate the quoted attribute early -- confirm and handle there.
 *
 * \param ctx        Save-complete context; output goes to ctx->fp
 * \param value      Attribute value data
 * \param value_len  Length of \a value, in bytes
 * \return true on success, false if the value could not be converted
 */
static bool save_complete_write_value(save_complete_ctx *ctx,
		const char *value, size_t value_len)
{
	char *escaped;
	utf8_convert_ret ret;

	ret = utf8_to_html(value, "UTF-8", value_len, &escaped);
	if (ret != UTF8_CONVERT_OK)
		return false;

	fprintf(ctx->fp, "\"%s\"", escaped);

	/* utf8_to_html() hands over ownership of the converted buffer */
	free(escaped);

	return true;
}
|
||||
@ -728,7 +755,7 @@ static bool save_complete_handle_attrs(save_complete_ctx *ctx,
|
||||
for (i = 0; i < length; i++) {
|
||||
dom_attr *attr;
|
||||
|
||||
error = dom_namednodemap_item(attrs, i, &attr);
|
||||
error = dom_namednodemap_item(attrs, i, (void *) &attr);
|
||||
if (error != DOM_NO_ERR)
|
||||
return false;
|
||||
|
||||
@ -753,6 +780,7 @@ static bool save_complete_handle_element(save_complete_ctx *ctx,
|
||||
dom_namednodemap *attrs;
|
||||
const char *name_data;
|
||||
size_t name_len;
|
||||
bool process = true;
|
||||
dom_exception error;
|
||||
|
||||
ctx->iter_state = STATE_NORMAL;
|
||||
@ -767,9 +795,56 @@ static bool save_complete_handle_element(save_complete_ctx *ctx,
|
||||
name_data = dom_string_data(name);
|
||||
name_len = dom_string_byte_length(name);
|
||||
|
||||
/* Elide BASE elements from the output */
|
||||
if (name_len == SLEN("base") &&
|
||||
strncasecmp(name_data, "base", name_len) == 0) {
|
||||
/* Elide BASE elements from the output */
|
||||
process = false;
|
||||
} else if (name_len == SLEN("meta") &&
|
||||
strncasecmp(name_data, "meta", name_len) == 0) {
|
||||
/* Don't emit close tags for META elements */
|
||||
if (event_type == EVENT_LEAVE) {
|
||||
process = false;
|
||||
} else {
|
||||
/* Elide meta charsets */
|
||||
dom_string *value;
|
||||
error = dom_element_get_attribute(node,
|
||||
corestring_dom_http_equiv, &value);
|
||||
if (error != DOM_NO_ERR) {
|
||||
dom_string_unref(name);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (value != NULL) {
|
||||
if (dom_string_length(value) ==
|
||||
SLEN("Content-Type") &&
|
||||
strncasecmp(dom_string_data(value),
|
||||
"Content-Type",
|
||||
SLEN("Content-Type")) == 0)
|
||||
process = false;
|
||||
|
||||
dom_string_unref(value);
|
||||
} else {
|
||||
bool yes;
|
||||
|
||||
error = dom_element_has_attribute(node,
|
||||
corestring_dom_charset, &yes);
|
||||
if (error != DOM_NO_ERR) {
|
||||
dom_string_unref(name);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (yes)
|
||||
process = false;
|
||||
}
|
||||
}
|
||||
} else if (event_type == EVENT_LEAVE &&
|
||||
((name_len == SLEN("link") &&
|
||||
strncasecmp(name_data, "link", name_len) == 0))) {
|
||||
/* Don't emit close tags for void elements */
|
||||
process = false;
|
||||
}
|
||||
|
||||
if (process == false) {
|
||||
dom_string_unref(name);
|
||||
return true;
|
||||
}
|
||||
@ -833,6 +908,12 @@ static bool save_complete_handle_element(save_complete_ctx *ctx,
|
||||
}
|
||||
|
||||
ctx->iter_state = STATE_IN_STYLE;
|
||||
} else if (event_type == EVENT_ENTER && name_len == SLEN("head") &&
|
||||
strncasecmp(name_data, "head", name_len) == 0) {
|
||||
/* If this is a HEAD element, insert a meta charset */
|
||||
fputs("<META http-equiv=\"Content-Type\" "
|
||||
"content=\"text/html; charset=utf-8\">",
|
||||
ctx->fp);
|
||||
}
|
||||
|
||||
dom_string_unref(name);
|
||||
@ -846,6 +927,7 @@ static bool save_complete_node_handler(dom_node *node,
|
||||
save_complete_ctx *ctx = ctxin;
|
||||
dom_node_type type;
|
||||
dom_exception error;
|
||||
utf8_convert_ret ret;
|
||||
|
||||
error = dom_node_get_node_type(node, &type);
|
||||
if (error != DOM_NO_ERR)
|
||||
@ -872,11 +954,20 @@ static bool save_complete_node_handler(dom_node *node,
|
||||
fwrite("<!--", 1, sizeof("<!--") - 1, ctx->fp);
|
||||
|
||||
if (text != NULL) {
|
||||
char *escaped;
|
||||
|
||||
text_data = dom_string_data(text);
|
||||
text_len = dom_string_byte_length(text);
|
||||
|
||||
fwrite(text_data, sizeof(*text_data),
|
||||
text_len, ctx->fp);
|
||||
ret = utf8_to_html(text_data, "UTF-8",
|
||||
text_len, &escaped);
|
||||
if (ret != UTF8_CONVERT_OK)
|
||||
return false;
|
||||
|
||||
fwrite(escaped, sizeof(*escaped),
|
||||
strlen(escaped), ctx->fp);
|
||||
|
||||
free(escaped);
|
||||
|
||||
dom_string_unref(text);
|
||||
}
|
||||
@ -917,8 +1008,9 @@ static bool save_complete_node_handler(dom_node *node,
|
||||
name_data = dom_string_data(name);
|
||||
name_len = dom_string_byte_length(name);
|
||||
|
||||
fprintf(ctx->fp, " PUBLIC \"%.*s\"",
|
||||
(int) name_len, name_data);
|
||||
if (name_len > 0)
|
||||
fprintf(ctx->fp, " PUBLIC \"%.*s\"",
|
||||
(int) name_len, name_data);
|
||||
|
||||
dom_string_unref(name);
|
||||
}
|
||||
@ -931,8 +1023,9 @@ static bool save_complete_node_handler(dom_node *node,
|
||||
name_data = dom_string_data(name);
|
||||
name_len = dom_string_byte_length(name);
|
||||
|
||||
fprintf(ctx->fp, " \"%.*s\"",
|
||||
(int) name_len, name_data);
|
||||
if (name_len > 0)
|
||||
fprintf(ctx->fp, " \"%.*s\"",
|
||||
(int) name_len, name_data);
|
||||
|
||||
dom_string_unref(name);
|
||||
}
|
||||
|
@ -817,11 +817,11 @@ static bool tree_url_save_entry(struct node *entry, FILE *fp)
|
||||
if (href == NULL)
|
||||
return false;
|
||||
|
||||
ret = utf8_to_enc(text, "iso-8859-1", strlen(text), &latin1_text);
|
||||
ret = utf8_to_html(text, "iso-8859-1", strlen(text), &latin1_text);
|
||||
if (ret != UTF8_CONVERT_OK)
|
||||
return false;
|
||||
|
||||
ret = utf8_to_enc(href, "iso-8859-1", strlen(href), &latin1_href);
|
||||
ret = utf8_to_html(href, "iso-8859-1", strlen(href), &latin1_href);
|
||||
if (ret != UTF8_CONVERT_OK) {
|
||||
free(latin1_text);
|
||||
return false;
|
||||
@ -872,7 +872,7 @@ static bool tree_url_save_directory(struct node *directory, FILE *fp)
|
||||
if (text == NULL)
|
||||
return false;
|
||||
|
||||
ret = utf8_to_enc(text, "iso-8859-1",
|
||||
ret = utf8_to_html(text, "iso-8859-1",
|
||||
strlen(text), &latin1_text);
|
||||
if (ret != UTF8_CONVERT_OK)
|
||||
return false;
|
||||
@ -919,7 +919,7 @@ bool tree_urlfile_save(struct tree *tree, const char *filename,
|
||||
fputs("<meta http-equiv=\"Content-Type\" "
|
||||
"content=\"text/html; charset=iso-8859-1\">\n", fp);
|
||||
fprintf(fp, "<title>%s</title>\n", page_title);
|
||||
fputs("<body>", fp);
|
||||
fputs("</head>\n<body>", fp);
|
||||
|
||||
if (tree_url_save_directory(tree_get_root(tree), fp) == false) {
|
||||
warn_user("HotlistSaveError", 0);
|
||||
|
@ -133,6 +133,7 @@ dom_string *corestring_dom_canplaythrough;
|
||||
dom_string *corestring_dom_cellpadding;
|
||||
dom_string *corestring_dom_cellspacing;
|
||||
dom_string *corestring_dom_change;
|
||||
dom_string *corestring_dom_charset;
|
||||
dom_string *corestring_dom_click;
|
||||
dom_string *corestring_dom_close;
|
||||
dom_string *corestring_dom_color;
|
||||
@ -355,6 +356,7 @@ void corestrings_fini(void)
|
||||
CSS_DOM_STRING_UNREF(cellpadding);
|
||||
CSS_DOM_STRING_UNREF(cellspacing);
|
||||
CSS_DOM_STRING_UNREF(change);
|
||||
CSS_DOM_STRING_UNREF(charset);
|
||||
CSS_DOM_STRING_UNREF(click);
|
||||
CSS_DOM_STRING_UNREF(close);
|
||||
CSS_DOM_STRING_UNREF(color);
|
||||
@ -608,6 +610,7 @@ nserror corestrings_init(void)
|
||||
CSS_DOM_STRING_INTERN(cellpadding);
|
||||
CSS_DOM_STRING_INTERN(cellspacing);
|
||||
CSS_DOM_STRING_INTERN(change);
|
||||
CSS_DOM_STRING_INTERN(charset);
|
||||
CSS_DOM_STRING_INTERN(click);
|
||||
CSS_DOM_STRING_INTERN(close);
|
||||
CSS_DOM_STRING_INTERN(color);
|
||||
|
@ -139,6 +139,7 @@ extern struct dom_string *corestring_dom_canplaythrough;
|
||||
extern struct dom_string *corestring_dom_cellpadding;
|
||||
extern struct dom_string *corestring_dom_cellspacing;
|
||||
extern struct dom_string *corestring_dom_change;
|
||||
extern struct dom_string *corestring_dom_charset;
|
||||
extern struct dom_string *corestring_dom_click;
|
||||
extern struct dom_string *corestring_dom_close;
|
||||
extern struct dom_string *corestring_dom_color;
|
||||
|
139
utils/utf8.c
139
utils/utf8.c
@ -297,7 +297,7 @@ utf8_convert_ret utf8_convert(const char *string, size_t len,
|
||||
}
|
||||
|
||||
slen = len ? len : strlen(string);
|
||||
/* Worst case = ACSII -> UCS4, so allocate an output buffer
|
||||
/* Worst case = ASCII -> UCS4, so allocate an output buffer
|
||||
* 4 times larger than the input buffer, and add 4 bytes at
|
||||
* the end for the NULL terminator
|
||||
*/
|
||||
@ -337,3 +337,140 @@ utf8_convert_ret utf8_convert(const char *string, size_t len,
|
||||
|
||||
return UTF8_CONVERT_OK;
|
||||
}
|
||||
|
||||
static utf8_convert_ret utf8_convert_html_chunk(iconv_t cd,
|
||||
const char *chunk, size_t inlen,
|
||||
char **out, size_t *outlen)
|
||||
{
|
||||
size_t ret, esclen;
|
||||
uint32_t ucs4;
|
||||
char *pescape, escape[11];
|
||||
|
||||
while (inlen > 0) {
|
||||
ret = iconv(cd, (void *) &chunk, &inlen, (void *) out, outlen);
|
||||
if (ret != (size_t) -1)
|
||||
break;
|
||||
|
||||
if (errno != EILSEQ)
|
||||
return UTF8_CONVERT_NOMEM;
|
||||
|
||||
ucs4 = utf8_to_ucs4(chunk, inlen);
|
||||
esclen = snprintf(escape, sizeof(escape), "&#x%06x;", ucs4);
|
||||
pescape = escape;
|
||||
ret = iconv(cd, (void *) &pescape, &esclen,
|
||||
(void *) out, outlen);
|
||||
if (ret == (size_t) -1)
|
||||
return UTF8_CONVERT_NOMEM;
|
||||
|
||||
esclen = utf8_next(chunk, inlen, 0);
|
||||
chunk += esclen;
|
||||
inlen -= esclen;
|
||||
}
|
||||
|
||||
return UTF8_CONVERT_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a UTF-8 encoded string into a string of the given encoding,
|
||||
* applying HTML escape sequences where necessary.
|
||||
*
|
||||
* \param string String to convert (NUL-terminated)
|
||||
* \param encname Name of encoding to convert to
|
||||
* \param len Length, in bytes, of the input string, or 0
|
||||
* \param result Pointer to location to receive result
|
||||
* \return Appropriate utf8_convert_ret value
|
||||
*/
|
||||
utf8_convert_ret utf8_to_html(const char *string, const char *encname,
|
||||
size_t len, char **result)
|
||||
{
|
||||
iconv_t cd;
|
||||
const char *in;
|
||||
char *out, *origout;
|
||||
size_t off, prev_off, inlen, outlen, origoutlen, esclen;
|
||||
utf8_convert_ret ret;
|
||||
char *pescape, escape[11];
|
||||
|
||||
if (len == 0)
|
||||
len = strlen(string);
|
||||
|
||||
cd = iconv_open(encname, "UTF-8");
|
||||
if (cd == (iconv_t) -1) {
|
||||
if (errno == EINVAL)
|
||||
return UTF8_CONVERT_BADENC;
|
||||
/* default to no memory */
|
||||
return UTF8_CONVERT_NOMEM;
|
||||
}
|
||||
|
||||
/* Worst case is ASCII -> UCS4, with all characters escaped:
|
||||
* "&#xYYYYYY;", thus each input character may become a string
|
||||
* of 10 UCS4 characters, each 4 bytes in length */
|
||||
origoutlen = outlen = len * 10 * 4;
|
||||
origout = out = malloc(outlen);
|
||||
if (out == NULL) {
|
||||
iconv_close(cd);
|
||||
return UTF8_CONVERT_NOMEM;
|
||||
}
|
||||
|
||||
/* Process input in chunks between characters we must escape */
|
||||
prev_off = off = 0;
|
||||
while (off < len) {
|
||||
/* Must escape '&', '<', and '>' */
|
||||
if (string[off] == '&' || string[off] == '<' ||
|
||||
string[off] == '>') {
|
||||
if (off - prev_off > 0) {
|
||||
/* Emit chunk */
|
||||
in = string + prev_off;
|
||||
inlen = off - prev_off;
|
||||
ret = utf8_convert_html_chunk(cd, in, inlen,
|
||||
&out, &outlen);
|
||||
if (ret != UTF8_CONVERT_OK) {
|
||||
free(origout);
|
||||
iconv_close(cd);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* Emit mandatory escape */
|
||||
esclen = snprintf(escape, sizeof(escape),
|
||||
"&#x%06x;", string[off]);
|
||||
pescape = escape;
|
||||
ret = utf8_convert_html_chunk(cd, pescape, esclen,
|
||||
&out, &outlen);
|
||||
if (ret != UTF8_CONVERT_OK) {
|
||||
free(origout);
|
||||
iconv_close(cd);
|
||||
return ret;
|
||||
}
|
||||
|
||||
prev_off = off = utf8_next(string, len, off);
|
||||
} else {
|
||||
off = utf8_next(string, len, off);
|
||||
}
|
||||
}
|
||||
|
||||
/* Process final chunk */
|
||||
if (prev_off < len) {
|
||||
in = string + prev_off;
|
||||
inlen = len - prev_off;
|
||||
ret = utf8_convert_html_chunk(cd, in, inlen, &out, &outlen);
|
||||
if (ret != UTF8_CONVERT_OK) {
|
||||
free(origout);
|
||||
iconv_close(cd);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
iconv_close(cd);
|
||||
|
||||
/* Shrink-wrap */
|
||||
*result = realloc(origout, origoutlen - outlen + 4);
|
||||
if (*result == NULL) {
|
||||
free(origout);
|
||||
return UTF8_CONVERT_NOMEM;
|
||||
}
|
||||
memset(*result + (origoutlen - outlen), 0, 4);
|
||||
|
||||
return UTF8_CONVERT_OK;
|
||||
}
|
||||
|
||||
|
||||
|
@ -47,6 +47,9 @@ utf8_convert_ret utf8_to_enc(const char *string, const char *encname,
|
||||
utf8_convert_ret utf8_from_enc(const char *string, const char *encname,
|
||||
size_t len, char **result);
|
||||
|
||||
utf8_convert_ret utf8_to_html(const char *string, const char *encname,
|
||||
size_t len, char **result);
|
||||
|
||||
/* These two are platform specific */
|
||||
utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
|
||||
char **result);
|
||||
|
Loading…
x
Reference in New Issue
Block a user