netsurf/desktop/save_text.c
Daniel Silverstone 6807b4208a Remove the netsurf/ from the include paths and rationalise use of <> vs "" in includes
NetSurf includes are now done with ""s and other system includes with <>s as C intended.
The scandeps tool has been updated to only look for ""ed includes, and to verify that the
files exist in the tree before adding them to the dependency lines. The depend rule has
therefore been augmented to make sure the autogenerated files are built before it is run.

This is untested under self-hosted RISC OS builds. All else tested and works.


svn path=/trunk/netsurf/; revision=3307
2007-05-30 22:39:54 +00:00

116 lines
2.4 KiB
C

/*
* This file is part of NetSurf, http://netsurf-browser.org/
* Licensed under the GNU General Public License,
* http://www.opensource.org/licenses/gpl-license
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
*/
#include <stdbool.h>
#include <string.h>
#include <libxml/HTMLtree.h>
#include "utils/config.h"
#include "content/content.h"
#include "desktop/save_text.h"
#include "utils/log.h"
#include "utils/utils.h"
#ifdef WITH_TEXT_EXPORT
/* Forward declarations of the file-local helpers that walk the parsed tree. */
static void extract_text(xmlDoc *doc);
static void extract_text_from_tree(xmlNode *n);
/* Output stream of the export in progress: assigned by save_as_text() and
 * written to by extract_text_from_tree(). */
static FILE *out;
/**
 * Export an HTML content as plain text.
 *
 * The content's source data is parsed with libxml's HTML parser and the text
 * found below the <html> element is written to the file at \a path.
 * Contents whose type is not CONTENT_HTML are ignored and no file is created
 * for them. Failures (file cannot be opened, parser context cannot be
 * created, no document produced) are not reported to the caller.
 *
 * \param c     content to export
 * \param path  path of the file to write; opened with mode "w"
 */
void save_as_text(struct content *c, char *path)
{
	htmlParserCtxtPtr toSave;

	if (c->type != CONTENT_HTML) {
		return;
	}

	out = fopen(path, "w");
	if (!out) {
		return;
	}

	/* Context creation can fail and return NULL; in that case there is
	 * nothing to parse, but the file opened above must still be closed. */
	toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size);
	if (toSave != NULL) {
		/* The parse result is not checked here; whatever tree the
		 * parser managed to build (if any) is exported. */
		htmlParseDocument(toSave);

		if (toSave->myDoc != NULL) {
			extract_text(toSave->myDoc);
			xmlFreeDoc(toSave->myDoc);
		}

		htmlFreeParserCtxt(toSave);
	}

	fclose(out);
	/* Do not leave the file-level stream pointing at a closed FILE. */
	out = NULL;
}
/**
 * Write the text below a document's root <html> element to the output file.
 *
 * The first element node among the document's top-level children is taken
 * as the root. Nothing is written when \a doc is NULL, when there is no such
 * element, or when that element is not named "html".
 *
 * \param doc  parsed document, may be NULL
 */
void extract_text(xmlDoc *doc)
{
	xmlNode *html;

	if (doc == NULL) {
		return;
	}

	/* find the html element */
	for (html = doc->children; html != NULL; html = html->next) {
		if (html->type == XML_ELEMENT_NODE) {
			break;
		}
	}

	if (html == NULL || strcmp((const char *) html->name, "html") != 0) {
		return;
	}

	extract_text_from_tree(html);
}
/**
 * Look up how many newlines follow the content of an element.
 *
 * \param name  element name as a NUL-terminated string
 * \return 2 for dl, h1, h2, h3, ol, title and ul; 1 for applet, br, div, dt,
 *         h4, h5, h6, li, object, p and tr; 0 for any other name
 */
static int newlines_after_element(const char *name)
{
	static const struct {
		const char *name;
		int newlines;
	} breaks[] = {
		{ "dl", 2 },
		{ "h1", 2 },
		{ "h2", 2 },
		{ "h3", 2 },
		{ "ol", 2 },
		{ "title", 2 },
		{ "ul", 2 },
		{ "applet", 1 },
		{ "br", 1 },
		{ "div", 1 },
		{ "dt", 1 },
		{ "h4", 1 },
		{ "h5", 1 },
		{ "h6", 1 },
		{ "li", 1 },
		{ "object", 1 },
		{ "p", 1 },
		{ "tr", 1 },
	};
	size_t i;

	for (i = 0; i < sizeof breaks / sizeof breaks[0]; i++) {
		if (strcmp(name, breaks[i].name) == 0) {
			return breaks[i].newlines;
		}
	}

	/* any other element is simply recursed through */
	return 0;
}

/**
 * Recursively write the text of a node and its descendants to the output
 * file.
 *
 * Text nodes have their content passed through squash_whitespace() and the
 * result written out. Element nodes have their children processed in order,
 * followed by the number of newlines given by newlines_after_element().
 * Nodes of any other type, and everything below them, are skipped.
 *
 * \param n  node to process, may be NULL
 */
void extract_text_from_tree(xmlNode *n)
{
	xmlNode *child;
	char *text;
	int need_nl;

	if (n == NULL) {
		return;
	}

	if (n->type == XML_TEXT_NODE) {
		/* Whether squash_whitespace() tolerates NULL is not visible
		 * from here, so never hand it a missing content pointer. */
		if (n->content == NULL) {
			return;
		}

		/* libxml strings are xmlChar (unsigned char); cast them for
		 * the char-based string functions, as extract_text() does. */
		text = squash_whitespace((const char *) n->content);
		if (text != NULL) {
			fputs(text, out);
			free(text);
		}
		return;
	}

	if (n->type != XML_ELEMENT_NODE) {
		return;
	}

	need_nl = newlines_after_element((const char *) n->name);

	/* now recurse */
	for (child = n->children; child != NULL; child = child->next) {
		extract_text_from_tree(child);
	}

	while (need_nl-- > 0) {
		fputc('\n', out);
	}
}
#endif