mirror of
https://github.com/netsurf-browser/netsurf
synced 2024-11-24 23:39:51 +03:00
6807b4208a
NetSurf includes are now done with ""s and other system includes with <>s as C intended. The scandeps tool has been updated to only look for ""ed includes, and to verify that the files exist in the tree before adding them to the dependency lines. The depend rule has therefore been augmented to make sure the autogenerated files are built before it is run. This is untested under self-hosted RISC OS builds. All else tested and works. svn path=/trunk/netsurf/; revision=3307
116 lines
2.4 KiB
C
116 lines
2.4 KiB
C
/*
|
|
* This file is part of NetSurf, http://netsurf-browser.org/
|
|
* Licensed under the GNU General Public License,
|
|
* http://www.opensource.org/licenses/gpl-license
|
|
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
|
|
*/
|
|
|
|
#include <stdbool.h>
|
|
#include <string.h>
|
|
|
|
#include <libxml/HTMLtree.h>
|
|
|
|
#include "utils/config.h"
|
|
#include "content/content.h"
|
|
#include "desktop/save_text.h"
|
|
#include "utils/log.h"
|
|
#include "utils/utils.h"
|
|
|
|
#ifdef WITH_TEXT_EXPORT
|
|
|
|
/* Write the text of a parsed document to the output stream `out`. */
static void extract_text(xmlDoc *doc);
|
|
/* Recursively write the text found below a node, adding newlines after
 * selected elements. */
static void extract_text_from_tree(xmlNode *n);
|
|
|
|
/* Stream the exported text is written to. It is opened and closed by
 * save_as_text() and written by extract_text_from_tree(). */
static FILE *out;
|
|
|
|
void save_as_text(struct content *c, char *path) {
|
|
|
|
htmlParserCtxtPtr toSave;
|
|
|
|
if (c->type != CONTENT_HTML) {
|
|
return;
|
|
}
|
|
|
|
out = fopen(path, "w");
|
|
if (!out) return;
|
|
|
|
toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size);
|
|
htmlParseDocument(toSave);
|
|
|
|
extract_text(toSave->myDoc);
|
|
|
|
fclose(out);
|
|
|
|
xmlFreeDoc(toSave->myDoc);
|
|
htmlFreeParserCtxt(toSave);
|
|
}
|
|
|
|
/**
 * Write the text of a parsed document to the output stream.
 *
 * The first element node among the document's top-level children is
 * located; if it is named "html" the text below it is extracted,
 * otherwise nothing is written.
 *
 * \param doc  parsed document; a NULL document is ignored
 */
void extract_text(xmlDoc *doc)
{
	xmlNode *html;

	/* A failed parse can leave the caller without a document. */
	if (doc == NULL) {
		return;
	}

	/* find the html element: skip top-level nodes that are not elements */
	html = doc->children;
	while (html != NULL && html->type != XML_ELEMENT_NODE) {
		html = html->next;
	}

	if (html == NULL || strcmp((const char *) html->name, "html") != 0) {
		return;
	}

	extract_text_from_tree(html);
}
|
|
|
|
/**
 * Look up how many newlines follow the content of an element.
 *
 * The comparison is case-sensitive.
 *
 * \param name  element name
 * \return 2 for dl, h1, h2, h3, ol, title and ul; 1 for applet, br, div,
 *         dt, h4, h5, h6, li, object, p and tr; 0 for anything else
 */
static int newlines_after_element(const char *name)
{
	static const struct {
		const char *name;
		int newlines;
	} breaks[] = {
		{ "dl", 2 },
		{ "h1", 2 },
		{ "h2", 2 },
		{ "h3", 2 },
		{ "ol", 2 },
		{ "title", 2 },
		{ "ul", 2 },
		{ "applet", 1 },
		{ "br", 1 },
		{ "div", 1 },
		{ "dt", 1 },
		{ "h4", 1 },
		{ "h5", 1 },
		{ "h6", 1 },
		{ "li", 1 },
		{ "object", 1 },
		{ "p", 1 },
		{ "tr", 1 }
	};
	size_t i;

	for (i = 0; i != sizeof breaks / sizeof breaks[0]; i++) {
		if (strcmp(name, breaks[i].name) == 0) {
			return breaks[i].newlines;
		}
	}

	return 0;
}

/**
 * Recursively write the text below a node to the output stream.
 *
 * Text nodes are passed through squash_whitespace() and written out.
 * Element nodes are descended into, and one or two newlines are written
 * after the children of selected elements. All other node types are
 * skipped together with anything below them.
 *
 * \param n  node to process
 */
void extract_text_from_tree(xmlNode *n)
{
	xmlNode *child;
	char *text;
	int need_nl;

	if (n->type == XML_TEXT_NODE) {
		/* NOTE(review): squash_whitespace() is not visible from here,
		 * so NULL content is skipped rather than passed to it. */
		if (n->content != NULL) {
			text = squash_whitespace((char *) n->content);
			if (text != NULL) {
				fputs(text, out);
				free(text);
			}
		}
		return;
	}

	if (n->type != XML_ELEMENT_NODE) {
		return;
	}

	/* Cast the name the same way extract_text() does before handing it
	 * to the string functions. */
	need_nl = newlines_after_element((const char *) n->name);

	/* now recurse; elements without a table entry are simply walked */
	for (child = n->children; child != NULL; child = child->next) {
		extract_text_from_tree(child);
	}

	/* The newlines go after the element's children have been written. */
	while (need_nl-- > 0) {
		fputc('\n', out);
	}
}
|
|
|
|
#endif
|