2004-03-25 03:31:45 +03:00
|
|
|
/*
|
|
|
|
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
|
2008-04-13 22:21:22 +04:00
|
|
|
* Copyright 2008 Michael Drake <tlsa@netsurf-browser.org>
|
2007-08-08 20:16:03 +04:00
|
|
|
*
|
|
|
|
* This file is part of NetSurf, http://www.netsurf-browser.org/
|
|
|
|
*
|
|
|
|
* NetSurf is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation; version 2 of the License.
|
|
|
|
*
|
|
|
|
* NetSurf is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
2004-03-25 03:31:45 +03:00
|
|
|
*/
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
/** \file
|
|
|
|
* Text export of HTML (implementation).
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <assert.h>
|
2004-03-25 03:31:45 +03:00
|
|
|
#include <stdbool.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
2012-03-25 00:55:22 +04:00
|
|
|
#include <dom/dom.h>
|
|
|
|
|
2007-05-31 02:39:54 +04:00
|
|
|
#include "utils/config.h"
|
2014-02-02 02:17:36 +04:00
|
|
|
#include "utils/log.h"
|
|
|
|
#include "utils/utf8.h"
|
|
|
|
#include "utils/utils.h"
|
2016-06-06 10:59:23 +03:00
|
|
|
#include "netsurf/content.h"
|
2018-05-10 13:34:26 +03:00
|
|
|
#include "html/box.h"
|
2018-05-11 15:15:17 +03:00
|
|
|
#include "html/html_save.h"
|
2014-02-02 02:17:36 +04:00
|
|
|
|
2016-05-30 13:20:15 +03:00
|
|
|
#include "netsurf/utf8.h"
|
2014-10-16 12:48:09 +04:00
|
|
|
#include "desktop/gui_internal.h"
|
2014-02-02 02:17:36 +04:00
|
|
|
#include "desktop/save_text.h"
|
2004-03-25 03:31:45 +03:00
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
static void extract_text(struct box *box, bool *first,
|
|
|
|
save_text_whitespace *before, struct save_text_state *save);
|
|
|
|
static bool save_text_add_to_buffer(const char *text, size_t length,
|
|
|
|
struct box *box, const char *whitespace_text,
|
|
|
|
size_t whitespace_length, struct save_text_state *save);
|
2004-03-25 03:31:45 +03:00
|
|
|
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
/**
|
2008-04-14 12:28:55 +04:00
|
|
|
* Extract the text from an HTML content and save it as a text file. Text is
|
|
|
|
* converted to the local encoding.
|
2008-04-13 22:21:22 +04:00
|
|
|
*
|
|
|
|
* \param c An HTML content.
|
|
|
|
* \param path Path to save text file too.
|
|
|
|
*/
|
2004-03-25 03:31:45 +03:00
|
|
|
|
2016-06-06 10:59:23 +03:00
|
|
|
void save_as_text(struct hlcache_handle *c, char *path)
|
2008-04-13 22:21:22 +04:00
|
|
|
{
|
|
|
|
FILE *out;
|
|
|
|
struct save_text_state save = { NULL, 0, 0 };
|
|
|
|
save_text_whitespace before = WHITESPACE_NONE;
|
|
|
|
bool first = true;
|
2014-01-29 01:40:13 +04:00
|
|
|
nserror ret;
|
2008-04-13 22:21:22 +04:00
|
|
|
char *result;
|
2004-03-25 03:31:45 +03:00
|
|
|
|
2010-03-28 16:56:39 +04:00
|
|
|
if (!c || content_get_type(c) != CONTENT_HTML) {
|
2004-03-25 03:31:45 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2010-03-28 16:56:39 +04:00
|
|
|
extract_text(html_get_box_tree(c), &first, &before, &save);
|
2008-04-13 22:21:22 +04:00
|
|
|
if (!save.block)
|
|
|
|
return;
|
2004-03-25 03:31:45 +03:00
|
|
|
|
2014-02-02 02:17:36 +04:00
|
|
|
ret = guit->utf8->utf8_to_local(save.block, save.length, &result);
|
2008-04-13 22:21:22 +04:00
|
|
|
free(save.block);
|
2004-03-25 03:31:45 +03:00
|
|
|
|
2014-01-29 01:40:13 +04:00
|
|
|
if (ret != NSERROR_OK) {
|
Use coccinelle to change logging macro calls in c files
for F in $(git ls-files '*.c');do spatch --sp-file foo.cocci --in-place ${F};done
@@ expression E; @@
-LOG(E);
+NSLOG(netsurf, INFO, E);
@@ expression E, E1; @@
-LOG(E, E1);
+NSLOG(netsurf, INFO, E, E1);
@@ expression E, E1, E2; @@
-LOG(E, E1, E2);
+NSLOG(netsurf, INFO, E, E1, E2);
@@ expression E, E1, E2, E3; @@
-LOG(E, E1, E2, E3);
+NSLOG(netsurf, INFO, E, E1, E2, E3);
@@ expression E, E1, E2, E3, E4; @@
-LOG(E, E1, E2, E3, E4);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4);
@@ expression E, E1, E2, E3, E4, E5; @@
-LOG(E, E1, E2, E3, E4, E5);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5);
@@ expression E, E1, E2, E3, E4, E5, E6; @@
-LOG(E, E1, E2, E3, E4, E5, E6);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5, E6);
@@ expression E, E1, E2, E3, E4, E5, E6, E7; @@
-LOG(E, E1, E2, E3, E4, E5, E6, E7);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5, E6, E7);
2017-09-06 20:28:12 +03:00
|
|
|
NSLOG(netsurf, INFO,
|
|
|
|
"failed to convert to local encoding, return %d", ret);
|
2008-04-13 22:21:22 +04:00
|
|
|
return;
|
|
|
|
}
|
2004-03-25 03:31:45 +03:00
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
out = fopen(path, "w");
|
|
|
|
if (out) {
|
|
|
|
int res = fputs(result, out);
|
2009-05-28 20:03:48 +04:00
|
|
|
|
|
|
|
if (res < 0) {
|
Use coccinelle to change logging macro calls in c files
for F in $(git ls-files '*.c');do spatch --sp-file foo.cocci --in-place ${F};done
@@ expression E; @@
-LOG(E);
+NSLOG(netsurf, INFO, E);
@@ expression E, E1; @@
-LOG(E, E1);
+NSLOG(netsurf, INFO, E, E1);
@@ expression E, E1, E2; @@
-LOG(E, E1, E2);
+NSLOG(netsurf, INFO, E, E1, E2);
@@ expression E, E1, E2, E3; @@
-LOG(E, E1, E2, E3);
+NSLOG(netsurf, INFO, E, E1, E2, E3);
@@ expression E, E1, E2, E3, E4; @@
-LOG(E, E1, E2, E3, E4);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4);
@@ expression E, E1, E2, E3, E4, E5; @@
-LOG(E, E1, E2, E3, E4, E5);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5);
@@ expression E, E1, E2, E3, E4, E5, E6; @@
-LOG(E, E1, E2, E3, E4, E5, E6);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5, E6);
@@ expression E, E1, E2, E3, E4, E5, E6, E7; @@
-LOG(E, E1, E2, E3, E4, E5, E6, E7);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5, E6, E7);
2017-09-06 20:28:12 +03:00
|
|
|
NSLOG(netsurf, INFO, "Warning: write failed");
|
2009-05-28 20:03:48 +04:00
|
|
|
}
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
res = fputs("\n", out);
|
2009-05-28 20:03:48 +04:00
|
|
|
if (res < 0) {
|
Use coccinelle to change logging macro calls in c files
for F in $(git ls-files '*.c');do spatch --sp-file foo.cocci --in-place ${F};done
@@ expression E; @@
-LOG(E);
+NSLOG(netsurf, INFO, E);
@@ expression E, E1; @@
-LOG(E, E1);
+NSLOG(netsurf, INFO, E, E1);
@@ expression E, E1, E2; @@
-LOG(E, E1, E2);
+NSLOG(netsurf, INFO, E, E1, E2);
@@ expression E, E1, E2, E3; @@
-LOG(E, E1, E2, E3);
+NSLOG(netsurf, INFO, E, E1, E2, E3);
@@ expression E, E1, E2, E3, E4; @@
-LOG(E, E1, E2, E3, E4);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4);
@@ expression E, E1, E2, E3, E4, E5; @@
-LOG(E, E1, E2, E3, E4, E5);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5);
@@ expression E, E1, E2, E3, E4, E5, E6; @@
-LOG(E, E1, E2, E3, E4, E5, E6);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5, E6);
@@ expression E, E1, E2, E3, E4, E5, E6, E7; @@
-LOG(E, E1, E2, E3, E4, E5, E6, E7);
+NSLOG(netsurf, INFO, E, E1, E2, E3, E4, E5, E6, E7);
2017-09-06 20:28:12 +03:00
|
|
|
NSLOG(netsurf, INFO,
|
|
|
|
"Warning: failed writing trailing newline");
|
2009-05-28 20:03:48 +04:00
|
|
|
}
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
fclose(out);
|
|
|
|
}
|
2009-05-28 20:03:48 +04:00
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
free(result);
|
2004-03-25 03:31:45 +03:00
|
|
|
}
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Decide what whitespace to place before the next bit of content-related text
|
2008-04-14 12:28:55 +04:00
|
|
|
* that is saved. Any existing whitespace is overridden if the whitespace for
|
|
|
|
* this box is more "significant".
|
2008-04-13 22:21:22 +04:00
|
|
|
*
|
|
|
|
* \param box Pointer to box.
|
|
|
|
* \param first Whether this is before the first bit of content-related
|
|
|
|
* text to be saved.
|
|
|
|
* \param before Type of whitespace currently intended to be placed
|
|
|
|
* before the next bit of content-related text to be saved.
|
|
|
|
* Updated if this box is worthy of more significant
|
|
|
|
* whitespace.
|
|
|
|
* \param whitespace_text Whitespace to place before next bit of
|
|
|
|
* content-related text to be saved.
|
|
|
|
* Updated if this box is worthy of more significant
|
|
|
|
* whitespace.
|
|
|
|
* \param whitespace_length Length of whitespace_text.
|
|
|
|
* Updated if this box is worthy of more significant
|
|
|
|
* whitespace.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void save_text_solve_whitespace(struct box *box, bool *first,
|
|
|
|
save_text_whitespace *before, const char **whitespace_text,
|
|
|
|
size_t *whitespace_length)
|
2004-03-25 03:31:45 +03:00
|
|
|
{
|
2008-04-13 22:21:22 +04:00
|
|
|
/* work out what whitespace should be placed before the next bit of
|
|
|
|
* text */
|
|
|
|
if (*before < WHITESPACE_TWO_NEW_LINES &&
|
|
|
|
/* significant box type */
|
|
|
|
(box->type == BOX_BLOCK ||
|
|
|
|
box->type == BOX_TABLE ||
|
|
|
|
box->type == BOX_FLOAT_LEFT ||
|
|
|
|
box->type == BOX_FLOAT_RIGHT) &&
|
|
|
|
/* and not a list element */
|
|
|
|
!box->list_marker &&
|
|
|
|
/* and not a marker... */
|
|
|
|
(!(box->parent && box->parent->list_marker == box) ||
|
|
|
|
/* ...unless marker follows WHITESPACE_TAB */
|
|
|
|
((box->parent && box->parent->list_marker == box) &&
|
|
|
|
*before == WHITESPACE_TAB))) {
|
|
|
|
*before = WHITESPACE_TWO_NEW_LINES;
|
2009-11-22 16:37:28 +03:00
|
|
|
} else if (*before <= WHITESPACE_ONE_NEW_LINE &&
|
2008-04-13 22:21:22 +04:00
|
|
|
(box->type == BOX_TABLE_ROW ||
|
|
|
|
box->type == BOX_BR ||
|
|
|
|
(box->type != BOX_INLINE &&
|
|
|
|
(box->parent && box->parent->list_marker == box)) ||
|
2009-11-22 16:37:28 +03:00
|
|
|
(box->parent && box->parent->style &&
|
2009-07-24 03:05:34 +04:00
|
|
|
(css_computed_white_space(box->parent->style) ==
|
2008-04-13 22:21:22 +04:00
|
|
|
CSS_WHITE_SPACE_PRE ||
|
2009-07-24 03:05:34 +04:00
|
|
|
css_computed_white_space(box->parent->style) ==
|
2008-04-13 22:21:22 +04:00
|
|
|
CSS_WHITE_SPACE_PRE_WRAP) &&
|
|
|
|
box->type == BOX_INLINE_CONTAINER))) {
|
|
|
|
if (*before == WHITESPACE_ONE_NEW_LINE)
|
|
|
|
*before = WHITESPACE_TWO_NEW_LINES;
|
|
|
|
else
|
|
|
|
*before = WHITESPACE_ONE_NEW_LINE;
|
|
|
|
}
|
|
|
|
else if (*before < WHITESPACE_TAB &&
|
|
|
|
(box->type == BOX_TABLE_CELL ||
|
|
|
|
box->list_marker)) {
|
|
|
|
*before = WHITESPACE_TAB;
|
2004-05-22 17:45:20 +04:00
|
|
|
}
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
if (*first) {
|
|
|
|
/* before the first bit of text to be saved; there is
|
|
|
|
* no preceding whitespace */
|
|
|
|
*whitespace_text = "";
|
|
|
|
*whitespace_length = 0;
|
|
|
|
} else {
|
|
|
|
/* set the whitespace that has been decided on */
|
|
|
|
switch (*before) {
|
|
|
|
case WHITESPACE_TWO_NEW_LINES:
|
|
|
|
*whitespace_text = "\n\n";
|
|
|
|
*whitespace_length = 2;
|
|
|
|
break;
|
|
|
|
case WHITESPACE_ONE_NEW_LINE:
|
|
|
|
*whitespace_text = "\n";
|
|
|
|
*whitespace_length = 1;
|
|
|
|
break;
|
|
|
|
case WHITESPACE_TAB:
|
|
|
|
*whitespace_text = "\t";
|
|
|
|
*whitespace_length = 1;
|
|
|
|
break;
|
|
|
|
case WHITESPACE_NONE:
|
|
|
|
*whitespace_text = "";
|
|
|
|
*whitespace_length = 0;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
*whitespace_text = "";
|
|
|
|
*whitespace_length = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2004-03-25 03:31:45 +03:00
|
|
|
}
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Traverse though the box tree and add all text to a save buffer.
|
|
|
|
*
|
|
|
|
* \param box Pointer to box.
|
|
|
|
* \param first Whether this is before the first bit of content-related
|
|
|
|
* text to be saved.
|
|
|
|
* \param before Type of whitespace currently intended to be placed
|
|
|
|
* before the next bit of content-related text to be saved.
|
|
|
|
* Updated if this box is worthy of more significant
|
|
|
|
* whitespace.
|
|
|
|
* \param save our save_text_state workspace pointer
|
|
|
|
* \return true iff the file writing succeeded and traversal should continue.
|
|
|
|
*/
|
|
|
|
|
|
|
|
void extract_text(struct box *box, bool *first, save_text_whitespace *before,
|
|
|
|
struct save_text_state *save)
|
2004-03-25 03:31:45 +03:00
|
|
|
{
|
2008-04-13 22:21:22 +04:00
|
|
|
struct box *child;
|
|
|
|
const char *whitespace_text = "";
|
|
|
|
size_t whitespace_length = 0;
|
|
|
|
|
|
|
|
assert(box);
|
|
|
|
|
|
|
|
/* If box has a list marker */
|
|
|
|
if (box->list_marker) {
|
|
|
|
/* do the marker box before continuing with the rest of the
|
|
|
|
* list element */
|
|
|
|
extract_text(box->list_marker, first, before, save);
|
2004-05-22 17:45:20 +04:00
|
|
|
}
|
2008-04-13 22:21:22 +04:00
|
|
|
|
|
|
|
/* read before calling the handler in case it modifies the tree */
|
|
|
|
child = box->children;
|
|
|
|
|
|
|
|
save_text_solve_whitespace(box, first, before, &whitespace_text,
|
|
|
|
&whitespace_length);
|
|
|
|
|
|
|
|
if (box->type != BOX_BR && !((box->type == BOX_FLOAT_LEFT ||
|
|
|
|
box->type == BOX_FLOAT_RIGHT) && !box->text) &&
|
|
|
|
box->length > 0 && box->text) {
|
|
|
|
/* Box meets criteria for export; add text to buffer */
|
|
|
|
save_text_add_to_buffer(box->text, box->length, box,
|
|
|
|
whitespace_text, whitespace_length, save);
|
|
|
|
*first = false;
|
|
|
|
*before = WHITESPACE_NONE;
|
2004-05-22 17:45:20 +04:00
|
|
|
}
|
2008-04-13 22:21:22 +04:00
|
|
|
|
|
|
|
/* Work though the children of this box, extracting any text */
|
|
|
|
while (child) {
|
|
|
|
extract_text(child, first, before, save);
|
|
|
|
child = child->next;
|
2004-03-25 03:31:45 +03:00
|
|
|
}
|
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Add text to save text buffer. Any preceding whitespace or following space is
|
|
|
|
* also added to the buffer.
|
|
|
|
*
|
|
|
|
* \param text Pointer to text being added.
|
|
|
|
* \param length Length of text to be appended (bytes).
|
|
|
|
* \param box Pointer to text box.
|
|
|
|
* \param whitespace_text Whitespace to place before text for formatting
|
|
|
|
* may be NULL.
|
|
|
|
* \param whitespace_length Length of whitespace_text.
|
|
|
|
* \param save Our save_text_state workspace pointer.
|
|
|
|
* \return true iff the file writing succeeded and traversal should continue.
|
|
|
|
*/
|
|
|
|
|
|
|
|
bool save_text_add_to_buffer(const char *text, size_t length, struct box *box,
|
|
|
|
const char *whitespace_text, size_t whitespace_length,
|
|
|
|
struct save_text_state *save)
|
|
|
|
{
|
|
|
|
size_t new_length;
|
|
|
|
int space = 0;
|
|
|
|
|
|
|
|
assert(save);
|
|
|
|
|
|
|
|
if (box->space > 0)
|
|
|
|
space = 1;
|
|
|
|
|
|
|
|
if (whitespace_text)
|
|
|
|
length += whitespace_length;
|
|
|
|
|
|
|
|
new_length = save->length + whitespace_length + length + space;
|
|
|
|
if (new_length >= save->alloc) {
|
|
|
|
size_t new_alloc = save->alloc + (save->alloc / 4);
|
|
|
|
char *new_block;
|
|
|
|
|
|
|
|
if (new_alloc < new_length) new_alloc = new_length;
|
|
|
|
|
|
|
|
new_block = realloc(save->block, new_alloc);
|
|
|
|
if (!new_block) return false;
|
|
|
|
|
|
|
|
save->block = new_block;
|
|
|
|
save->alloc = new_alloc;
|
|
|
|
}
|
|
|
|
if (whitespace_text) {
|
|
|
|
memcpy(save->block + save->length, whitespace_text,
|
|
|
|
whitespace_length);
|
2004-03-25 03:31:45 +03:00
|
|
|
}
|
2008-04-13 22:21:22 +04:00
|
|
|
memcpy(save->block + save->length + whitespace_length, text, length);
|
|
|
|
save->length += length;
|
|
|
|
|
|
|
|
if (space == 1)
|
|
|
|
save->block[save->length++] = ' ';
|
2004-05-22 17:45:20 +04:00
|
|
|
|
2008-04-13 22:21:22 +04:00
|
|
|
return true;
|
2004-03-25 03:31:45 +03:00
|
|
|
}
|