2004-03-25 03:31:45 +03:00
|
|
|
/*
|
2006-11-27 18:35:18 +03:00
|
|
|
* This file is part of NetSurf, http://netsurf-browser.org/
|
2004-03-25 03:31:45 +03:00
|
|
|
* Licensed under the GNU General Public License,
|
2004-05-22 17:45:20 +04:00
|
|
|
* http://www.opensource.org/licenses/gpl-license
|
2004-03-25 03:31:45 +03:00
|
|
|
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <stdbool.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "libxml/HTMLtree.h"
|
|
|
|
|
|
|
|
#include "netsurf/utils/config.h"
|
|
|
|
#include "netsurf/content/content.h"
|
|
|
|
#include "netsurf/desktop/save_text.h"
|
|
|
|
#include "netsurf/utils/log.h"
|
|
|
|
#include "netsurf/utils/utils.h"
|
|
|
|
|
|
|
|
#ifdef WITH_TEXT_EXPORT
|
|
|
|
|
|
|
|
/* Locate the root html element of a parsed document and write its text to
 * the output stream. */
static void extract_text(xmlDoc *doc);
|
|
|
|
/* Recursively write the text found below a node to the output stream. */
static void extract_text_from_tree(xmlNode *n);
|
|
|
|
|
|
|
|
/* Output stream for the export: opened and closed by save_as_text(),
 * written to by extract_text_from_tree(). */
static FILE *out;
|
|
|
|
|
|
|
|
void save_as_text(struct content *c, char *path) {
|
|
|
|
|
2004-05-22 17:45:20 +04:00
|
|
|
htmlParserCtxtPtr toSave;
|
2004-03-25 03:31:45 +03:00
|
|
|
|
|
|
|
if (c->type != CONTENT_HTML) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2004-05-22 17:45:20 +04:00
|
|
|
out = fopen(path, "w");
|
|
|
|
if (!out) return;
|
2004-03-25 03:31:45 +03:00
|
|
|
|
|
|
|
toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size);
|
|
|
|
htmlParseDocument(toSave);
|
|
|
|
|
|
|
|
extract_text(toSave->myDoc);
|
|
|
|
|
2004-05-22 17:45:20 +04:00
|
|
|
fclose(out);
|
2004-03-25 03:31:45 +03:00
|
|
|
|
|
|
|
xmlFreeDoc(toSave->myDoc);
|
|
|
|
htmlFreeParserCtxt(toSave);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * Write the text of a parsed document to the output stream.
 *
 * The first element node among the document's top-level children is taken
 * as the root. If there is no such node, or it is not named "html", nothing
 * is written.
 *
 * \param  doc  parsed document; a NULL document is ignored
 */
void extract_text(xmlDoc *doc)
{
	xmlNode *html;

	/* parsing may have produced no document at all */
	if (doc == NULL) {
		return;
	}

	/* find the html element */
	for (html = doc->children;
			html != NULL && html->type != XML_ELEMENT_NODE;
			html = html->next)
		;
	if (html == NULL || strcmp((const char *) html->name, "html") != 0) {
		return;
	}

	extract_text_from_tree(html);
}
|
|
|
|
|
|
|
|
/**
 * Number of newlines to write after the content of an element.
 *
 * \param  name  element name to look up
 * \return 2 for dl, h1, h2, h3, ol, title and ul; 1 for applet, br, div, dt,
 *         h4, h5, h6, li, object, p and tr; 0 for any other name
 */
static int newlines_after_element(const char *name)
{
	/* elements followed by two newlines */
	static const char * const two_newlines[] = {
		"dl", "h1", "h2", "h3", "ol", "title", "ul"
	};
	/* elements followed by one newline */
	static const char * const one_newline[] = {
		"applet", "br", "div", "dt", "h4", "h5", "h6", "li",
		"object", "p", "tr"
	};
	size_t i;

	for (i = 0; i != sizeof two_newlines / sizeof two_newlines[0]; i++) {
		if (strcmp(name, two_newlines[i]) == 0)
			return 2;
	}

	for (i = 0; i != sizeof one_newline / sizeof one_newline[0]; i++) {
		if (strcmp(name, one_newline[i]) == 0)
			return 1;
	}

	return 0;
}

/**
 * Recursively write the text below a node to the output stream.
 *
 * Text nodes are passed through squash_whitespace() and written with
 * fputs(). Element nodes write nothing themselves: their children are
 * visited in order and then, depending on the element name, zero, one or
 * two newlines are written. Nodes of any other type are ignored, and so
 * are their children.
 *
 * \param  n  node to process
 */
void extract_text_from_tree(xmlNode *n)
{
	xmlNode *this_node;
	char *text;
	int need_nl = 0;

	if (n->type == XML_ELEMENT_NODE) {
		/* libxml stores names as xmlChar; cast for the char-based
		 * string functions, as extract_text() does */
		need_nl = newlines_after_element((const char *) n->name);
	}
	else if (n->type == XML_TEXT_NODE) {
		/* a text node without content has nothing to write */
		if (n->content != NULL &&
				(text = squash_whitespace((char *) n->content)) != NULL) {
			fputs(text, out);
			free(text);
		}
		return;
	}
	else {
		return;
	}

	/* now recurse */
	for (this_node = n->children; this_node != NULL;
			this_node = this_node->next) {
		extract_text_from_tree(this_node);
	}

	/* newlines go after the element's content, not before it */
	while (need_nl--)
		fputc('\n', out);
}
|
|
|
|
|
|
|
|
#endif
|