Move new version of contrib/ xml into xml2, keep old version in /xml.

2004-03-05 03:57:58 +00:00 · 2004-03-05 03:57:58 +00:00 · 31f4b59a46
commit 31f4b59a46
parent adca025c9e
11 changed files with 751 additions and 0 deletions
--- a/contrib/README
+++ b/contrib/README
@ -217,5 +217,9 @@ vacuumlo -
 	by Peter T Mount <peter@retep.org.uk>

 xml -
+	Storing XML in PostgreSQL (obsolete version)
+	by John Gray <jgray@azuli.co.uk>
+
+xml2 -
 	Storing XML in PostgreSQL
 	by John Gray <jgray@azuli.co.uk>
--- a/contrib/xml/TODO
+++ b/contrib/xml/TODO
@ -0,0 +1,78 @@
+PGXML TODO List
+===============
+
+Some of these items still require much more thought! Since the first
+release, the XPath support has improved (because I'm no longer using a
+homemade algorithm!).
+
+1. Performance considerations
+
+At present each document is parsed to produce the DOM tree on every query.
+
+Pros: 
+	Easy
+	No persistent memory or storage allocation for parsed trees
+		(libxml docs suggest representation of a document might
+		 be 4 times the size of the text)
+
+Cons:
+	Slow/ CPU intensive to parse.
+	Makes it difficult for PLs to apply libxml manipulations to create
+		new documents or amend existing ones.
+
+
+2. XQuery 
+
+I'm not sure if the addition of XQuery would be best as a function or
+as a new front-end parser. This is one to think about, but with a
+decent implementation of XPath, one of the prerequisites is covered.
+
+3. DOM Interfaces
+
+Expose more aspects of the DOM to user functions/ PLs. This would
+allow a procedure in a PL to run some queries and then use exposed
+interfaces to libxml to create an XML document out of the query
+results. I accept the argument that this might be more properly
+performed on the client side.
+
+4. Returning sets of documents from XPath queries.
+
+Although the current implementation allows you to amalgamate the
+returned results into a single document, it's quite possible that
+you'd like to use the returned set of nodes as a source for FROM.
+ 
+Is there a good way to optimise/index the results of certain XPath
+operations to make them faster?:
+
+select docid, pgxml_xpath(document,'//site/location/text()','','') as location 
+from docstore
+where pgxml_xpath(document,'//site/name/text()','','') = 'Church Farm';
+
+and with multiple element occurrences in a document?
+
+select d.docid, pgxml_xpath(d.document,'//site/location/text()','','') 
+from docstore d, 
+pgxml_xpaths('docstore','document','//feature/type/text()','docid') ft 
+where ft.key = d.docid and ft.value ='Limekiln';
+
+pgxml_xpaths params are relname, attrname, xpath, returnkey. It would
+return a set of two-element tuples (key,value) consisting of the value of
+returnkey, and the cdata value of the xpath. The XML document would be
+defined by relname and attrname.
+
+The pgxml_xpaths function could be the basis of a functional index,
+which could speed up the above query very substantially, working
+through the normal query planner mechanism.
+
+5. Return type support.
+
+Better support for returning e.g. numeric or boolean values. I need to
+get to grips with the returned data from libxml first.
+
+ 
+John Gray <jgray@azuli.co.uk> 16 August 2001
+
+
+
+
+
+
--- a/contrib/xml/pgxml.c
+++ b/contrib/xml/pgxml.c
@ -0,0 +1,352 @@
+/********************************************************
+ * Interface code to parse an XML document using expat
+ ********************************************************/
+
+#include "postgres.h"
+#include "fmgr.h"
+
+#include "expat.h"
+#include "pgxml.h"
+
+/* Memory management - we make expat use standard pg MM */
+
+XML_Memory_Handling_Suite mhs;
+
+/* passthrough functions (palloc is a macro) */
+
+static void *
+pgxml_palloc(size_t size)
+{
+	/* palloc is a macro, so expat is handed this real function instead */
+	void	   *chunk = palloc(size);
+
+	return chunk;
+}
+
+static void *
+pgxml_repalloc(void *ptr, size_t size)
+{
+	/* resize through the PostgreSQL allocator on expat's behalf */
+	void	   *resized = repalloc(ptr, size);
+
+	return resized;
+}
+
+/*
+ * Release callback installed in the expat memory-handling suite.
+ * Frees ptr with pfree(); a NULL ptr is ignored.
+ */
+static void
+pgxml_pfree(void *ptr)
+{
+	/*
+	 * A free() replacement has to accept NULL as a no-op, so guard the
+	 * call.  pfree() is invoked as a plain statement because ISO C does
+	 * not allow "return expr;" in a function returning void.
+	 */
+	if (ptr != NULL)
+		pfree(ptr);
+}
+
+static void
+pgxml_mhs_init()
+{
+	/* Route every expat allocation through the palloc-based wrappers. */
+	mhs.free_fcn = pgxml_pfree;
+	mhs.realloc_fcn = pgxml_repalloc;
+	mhs.malloc_fcn = pgxml_palloc;
+}
+
+static void
+pgxml_handler_init()
+{
+	/*
+	 * Placeholder: currently does nothing.  This code should set up the
+	 * relevant handlers from user-supplied settings.  Quite how these
+	 * settings are made is another matter :)
+	 */
+}
+
+/* Returns true if document is well-formed */
+
+PG_FUNCTION_INFO_V1(pgxml_parse);
+
+Datum
+pgxml_parse(PG_FUNCTION_ARGS)
+{
+	/* SQL signature: pgxml_parse(document) */
+	text	   *doc = PG_GETARG_TEXT_P(0);
+	int32		doclen = VARSIZE(doc) - VARHDRSZ;
+	XML_Parser	parser;
+	bool		well_formed;
+
+	pgxml_mhs_init();
+	pgxml_handler_init();
+
+	parser = XML_ParserCreate_MM(NULL, &mhs, NULL);
+	if (parser == NULL)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
+				 errmsg("could not create expat parser")));
+		PG_RETURN_NULL();		/* seems appropriate if we couldn't parse */
+	}
+
+	/* The whole document is handed over as one final chunk. */
+	well_formed = (XML_Parse(parser, (char *) VARDATA(doc), doclen, 1) != 0);
+
+	/* The parser is released whatever the outcome was. */
+	XML_ParserFree(parser);
+
+	PG_RETURN_BOOL(well_formed);
+}
+
+/* XPath handling functions */
+
+/* XPath support here is for a very skeletal kind of XPath!
+   It was easy to program though... */
+
+/* This first is the core function that builds a result set. The
+   actual functions called by the user manipulate that result set
+   in various ways.
+*/
+
+/*
+ * build_xpath_results
+ *
+ * Parse doc with expat and collect the character data found under every
+ * element whose path matches pathstr.
+ *
+ * Returns a palloc'd XPath_Results whose entries point into xpr->resbuf
+ * (the caller pfree's both resbuf and the struct), or NULL when the
+ * document cannot be parsed.
+ *
+ * NOTE(review): resbuf is sized to the raw document length.  If expat can
+ * deliver more character data than the document holds (entity expansion,
+ * for instance) pgxml_charhandler would write past it -- to be confirmed.
+ */
+static XPath_Results *
+build_xpath_results(text *doc, text *pathstr)
+{
+	XPath_Results *xpr;
+	char	   *res;
+	pgxml_udata *udata;
+	XML_Parser	p;
+	int32		docsize;
+	int32		pathlen;
+	int			parsed_ok;
+
+	/* zeroed result set: rescount starts at 0 */
+	xpr = (XPath_Results *) palloc(sizeof(XPath_Results));
+	memset((void *) xpr, 0, sizeof(XPath_Results));
+
+	docsize = VARSIZE(doc) - VARHDRSZ;
+
+	/* res isn't going to be the real return type, it is just a buffer */
+	res = (char *) palloc(docsize);
+	memset((void *) res, 0, docsize);
+	xpr->resbuf = res;
+
+	/* zeroed handler state: empty currentpath, textgrab off, reslen 0 */
+	udata = (pgxml_udata *) palloc(sizeof(pgxml_udata));
+	memset((void *) udata, 0, sizeof(pgxml_udata));
+
+	/* NUL-terminated private copy of the path expression */
+	pathlen = VARSIZE(pathstr) - VARHDRSZ;
+	udata->path = (char *) palloc(pathlen + 1);
+	memcpy(udata->path, VARDATA(pathstr), pathlen);
+	udata->path[pathlen] = '\0';
+
+	udata->resptr = res;
+	udata->xpres = xpr;
+
+	/* Now fire up the parser */
+	pgxml_mhs_init();
+
+	p = XML_ParserCreate_MM(NULL, &mhs, NULL);
+	if (!p)
+	{
+		/* ereport(ERROR) does not hand control back to this function */
+		ereport(ERROR,
+				(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
+				 errmsg("could not create expat parser")));
+		return NULL;			/* not reached; keeps compilers quiet */
+	}
+	XML_SetUserData(p, (void *) udata);
+
+	/* Set the handlers */
+	XML_SetElementHandler(p, pgxml_starthandler, pgxml_endhandler);
+	XML_SetCharacterDataHandler(p, pgxml_charhandler);
+
+	parsed_ok = (XML_Parse(p, (char *) VARDATA(doc), docsize, 1) != 0);
+
+	/* parser and handler state are finished with on either path */
+	XML_ParserFree(p);
+	pfree(udata->path);
+	pfree(udata);
+
+	if (!parsed_ok)
+	{
+		/*
+		 * Also release the text buffer, so that a malformed document does
+		 * not leak it.
+		 */
+		pfree(res);
+		pfree(xpr);
+		return NULL;
+	}
+
+	return xpr;
+}
+
+
+PG_FUNCTION_INFO_V1(pgxml_xpath);
+
+/*
+ * pgxml_xpath(document, pathstr, index)
+ *
+ * Returns the index'th piece of text collected for pathstr, or NULL when
+ * the document does not parse or the index lies outside 1..rescount.
+ */
+Datum
+pgxml_xpath(PG_FUNCTION_ARGS)
+{
+	/* called as pgxml_xpath(document,pathstr, index) for the moment */
+
+	XPath_Results *xpresults;
+	text	   *restext = NULL;
+
+	text	   *t = PG_GETARG_TEXT_P(0);		/* document buffer */
+	text	   *t2 = PG_GETARG_TEXT_P(1);		/* path to match */
+	int32		ind = PG_GETARG_INT32(2) - 1;	/* SQL index is 1-based */
+
+	xpresults = build_xpath_results(t, t2);
+
+	/*
+	 * This needs to be changed depending on the mechanism for returning
+	 * our set of results.
+	 */
+
+	if (xpresults == NULL)		/* parse error (not WF or parser failure) */
+		PG_RETURN_NULL();
+
+	/*
+	 * Reject indexes below 1 as well as those past the end: a negative
+	 * ind would read outside reslens[] and results[].
+	 */
+	if (ind >= 0 && ind < xpresults->rescount)
+	{
+		int32		len = xpresults->reslens[ind];
+
+		restext = (text *) palloc(len + VARHDRSZ);
+		memcpy(VARDATA(restext), xpresults->results[ind], len);
+		VARATT_SIZEP(restext) = len + VARHDRSZ;
+	}
+
+	/* the result set is released on every path once it has been built */
+	pfree(xpresults->resbuf);
+	pfree(xpresults);
+
+	if (restext == NULL)
+		PG_RETURN_NULL();
+
+	PG_RETURN_TEXT_P(restext);
+}
+
+
+/*
+ * pgxml_pathcompare
+ *
+ * Compare the path of the currently open element (UD->currentpath) with
+ * the requested path (UD->path).
+ *
+ * If the requested path does not occur in the current path at all, any
+ * text grab in progress is ended and its result recorded.  If the current
+ * path ends with the requested path -- as the complete path when the
+ * request starts with '/', otherwise on an element boundary -- text
+ * grabbing is switched on.
+ */
+static void
+pgxml_pathcompare(void *userData)
+{
+	char	   *suffix;
+	size_t		curlen;
+	size_t		pathlen;
+
+	if (strstr(UD->currentpath, UD->path) == NULL)
+	{							/* Should we have more logic here ? */
+		if (UD->textgrab)
+		{
+			UD->textgrab = 0;
+			pgxml_finalisegrabbedtext(userData);
+		}
+		return;
+	}
+
+	/*
+	 * OK, the requested path occurs somewhere.  It must be anchored to
+	 * the *end* of the current path, so compare the tail directly: only
+	 * looking at the first strstr() hit would miss e.g. "/ab/a" for the
+	 * path "a".  strstr() succeeded, so pathlen cannot exceed curlen.
+	 */
+	curlen = strlen(UD->currentpath);
+	pathlen = strlen(UD->path);
+	suffix = UD->currentpath + (curlen - pathlen);
+
+	if (strcmp(suffix, UD->path) != 0)
+		return;
+
+	if ((UD->path)[0] == '/')
+	{
+		/* absolute request: the whole current path has to match */
+		if (suffix == UD->currentpath)
+			UD->textgrab = 1;
+	}
+	else
+	{
+		/*
+		 * relative request: must start right after a '/'.  The pointer
+		 * comparison keeps suffix[-1] inside currentpath.
+		 */
+		if (suffix > UD->currentpath && suffix[-1] == '/')
+			UD->textgrab = 1;
+	}
+}
+
+static void
+pgxml_starthandler(void *userData, const XML_Char * name,
+				   const XML_Char ** atts)
+{
+	char	   *curpath = UD->currentpath;
+
+	/* Append "/name" to the current path, provided it still fits. */
+	if ((strlen(name) + strlen(curpath)) <= MAXPATHLENGTH - 2)
+	{
+		strcat(curpath, "/");
+		strcat(curpath, name);
+	}
+	else
+		elog(WARNING, "path too long");
+
+	/*
+	 * While text is being grabbed nothing more happens here (depending on
+	 * user preference, the element could one day be "reconstituted" into
+	 * the result text).  Otherwise test the new path against the request.
+	 */
+	if (!UD->textgrab)
+		pgxml_pathcompare(userData);
+}
+
+static void
+pgxml_endhandler(void *userData, const XML_Char * name)
+{
+	/* Find the last path component, i.e. the element being closed. */
+	char	   *lastsep = strrchr(UD->currentpath, '/');
+
+	if (lastsep == NULL)
+	{
+		/* internal error */
+		elog(ERROR, "did not find '/'");
+		lastsep = UD->currentpath;
+	}
+
+	if (strcmp(name, lastsep + 1) == 0)
+		lastsep[0] = '\0';		/* Chop that element off the end */
+	else
+	{
+		/* unmatched entry, so the path is left as it is */
+		elog(WARNING, "wanted [%s], got [%s]", lastsep, name);
+	}
+
+	/* Closing an element may end the region whose text is being grabbed. */
+	if (UD->textgrab)
+		pgxml_pathcompare(userData);
+}
+
+static void
+pgxml_charhandler(void *userData, const XML_Char * s, int len)
+{
+	/* Text outside a matched element, and empty chunks, are ignored. */
+	if (!UD->textgrab || len <= 0)
+		return;
+
+	/* Append the chunk to the result buffer and advance the cursor. */
+	memcpy(UD->resptr, s, len);
+	UD->resptr += len;
+	UD->reslen += len;
+}
+
+/* Should I be using PG list types here? */
+
+/*
+ * pgxml_finalisegrabbedtext
+ *
+ * Record the text grabbed since the previous result as one entry of the
+ * result set: its start (resptr - reslen) and its length.  Results stay
+ * concatenated in the result buffer; results[]/reslens[] mark where each
+ * one begins and how long it is.
+ */
+static void
+pgxml_finalisegrabbedtext(void *userData)
+{
+	XPath_Results *xpr = UD->xpres;
+
+	if (xpr->rescount >= MAXRESULTS)
+	{
+		/*
+		 * results[] and reslens[] hold MAXRESULTS entries; drop the
+		 * surplus match instead of writing past the arrays.
+		 */
+		elog(WARNING, "too many results, ignoring extra matches");
+		UD->reslen = 0;
+		return;
+	}
+
+	/* In resptr/reslen, we have a single result. */
+	xpr->results[xpr->rescount] = UD->resptr - UD->reslen;
+	xpr->reslens[xpr->rescount] = UD->reslen;
+	UD->reslen = 0;
+	xpr->rescount++;
+}
--- a/contrib/xml/pgxml.h
+++ b/contrib/xml/pgxml.h
@ -0,0 +1,42 @@
+/* Header for pg xml parser interface */
+
+static void *pgxml_palloc(size_t size);
+static void *pgxml_repalloc(void *ptr, size_t size);
+static void pgxml_pfree(void *ptr);
+static void pgxml_mhs_init();
+static void pgxml_handler_init();
+Datum		pgxml_parse(PG_FUNCTION_ARGS);
+Datum		pgxml_xpath(PG_FUNCTION_ARGS);
+static void pgxml_starthandler(void *userData, const XML_Char * name,
+				   const XML_Char ** atts);
+static void pgxml_endhandler(void *userData, const XML_Char * name);
+static void pgxml_charhandler(void *userData, const XML_Char * s, int len);
+static void pgxml_pathcompare(void *userData);
+static void pgxml_finalisegrabbedtext(void *userData);
+
+#define MAXPATHLENGTH 512		/* size in bytes of pgxml_udata.currentpath */
+#define MAXRESULTS 100			/* capacity of XPath_Results.results[] / reslens[] */
+
+
+typedef struct					/* set of text results collected for one path query */
+{
+	int			rescount;		/* number of entries filled in below */
+	char	   *results[MAXRESULTS];	/* start of each result inside resbuf */
+	int32		reslens[MAXRESULTS];	/* length in bytes of each result */
+	char	   *resbuf;			/* pointer to the result buffer for pfree */
+}	XPath_Results;
+
+
+
+typedef struct					/* state shared by the expat handlers via userData */
+{
+	char		currentpath[MAXPATHLENGTH];	/* "/"-joined names of the open elements */
+	char	   *path;			/* NUL-terminated path being searched for */
+	int			textgrab;		/* nonzero while character data is being collected */
+	char	   *resptr;			/* next write position in the result buffer */
+	int32		reslen;			/* bytes collected for the result in progress */
+	XPath_Results *xpres;		/* result set being filled in */
+}	pgxml_udata;
+
+
+#define UD ((pgxml_udata *) userData)	/* needs a variable named userData in scope */
--- a/contrib/xml/pgxml_dom.c
+++ b/contrib/xml/pgxml_dom.c
@ -0,0 +1,265 @@
+/* Parser interface for DOM-based parser (libxml) rather than
+   stream-based SAX-type parser */
+
+#include "postgres.h"
+#include "fmgr.h"
+
+/* libxml includes */
+
+#include <libxml/xpath.h>
+#include <libxml/tree.h>
+#include <libxml/xmlmemory.h>
+
+/* declarations */
+
+static void *pgxml_palloc(size_t size);
+static void *pgxml_repalloc(void *ptr, size_t size);
+static void pgxml_pfree(void *ptr);
+static char *pgxml_pstrdup(const char *string);
+
+static void pgxml_parser_init();
+
+static xmlChar *pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlDocPtr doc,
+				   xmlChar * toptagname, xmlChar * septagname,
+				   int format);
+
+static xmlChar *pgxml_texttoxmlchar(text *textstring);
+
+
+Datum		pgxml_parse(PG_FUNCTION_ARGS);
+Datum		pgxml_xpath(PG_FUNCTION_ARGS);
+
+/* memory handling passthrough functions (e.g. palloc, pstrdup are
+   currently macros, and the others might become so...) */
+
+static void *
+pgxml_palloc(size_t size)
+{
+	/* real function standing in for the palloc macro, for libxml's use */
+	void	   *chunk = palloc(size);
+
+	return chunk;
+}
+
+static void *
+pgxml_repalloc(void *ptr, size_t size)
+{
+	/* resize through the PostgreSQL allocator on libxml's behalf */
+	void	   *resized = repalloc(ptr, size);
+
+	return resized;
+}
+
+/*
+ * Release callback installed through xmlMemSetup.
+ * Frees ptr with pfree(); a NULL ptr is ignored.
+ */
+static void
+pgxml_pfree(void *ptr)
+{
+	/*
+	 * A free() replacement has to accept NULL as a no-op, so guard the
+	 * call.  pfree() is invoked as a plain statement because ISO C does
+	 * not allow "return expr;" in a function returning void.
+	 */
+	if (ptr != NULL)
+		pfree(ptr);
+}
+
+static char *
+pgxml_pstrdup(const char *string)
+{
+	/* real function standing in for the pstrdup macro, for libxml's use */
+	char	   *copy = pstrdup(string);
+
+	return copy;
+}
+
+static void
+pgxml_parser_init()
+{
+	/*
+	 * Hand libxml the palloc-based wrappers as its memory routines, then
+	 * initialise the parser.
+	 *
+	 * This code should also set parser settings from user-supplied info.
+	 * Quite how these settings are made is another matter :)
+	 */
+
+	xmlMemSetup(pgxml_pfree, pgxml_palloc, pgxml_repalloc, pgxml_pstrdup);	/* free, malloc, realloc, strdup */
+	xmlInitParser();
+
+}
+
+
+/* Returns true if document is well-formed */
+
+PG_FUNCTION_INFO_V1(pgxml_parse);
+
+/*
+ * pgxml_parse(document)
+ *
+ * Returns true when libxml can parse the document, i.e. it is
+ * well-formed, and false otherwise.
+ */
+Datum
+pgxml_parse(PG_FUNCTION_ARGS)
+{
+	/* called as pgxml_parse(document) */
+	xmlDocPtr	doctree;
+	text	   *t = PG_GETARG_TEXT_P(0);		/* document buffer */
+	int32		docsize = VARSIZE(t) - VARHDRSZ;
+	bool		well_formed;
+
+	pgxml_parser_init();
+
+	doctree = xmlParseMemory((char *) VARDATA(t), docsize);
+	well_formed = (doctree != NULL);
+
+	/*
+	 * Free the tree before shutting the library down, in the same order
+	 * pgxml_xpath uses, so that no libxml call follows xmlCleanupParser().
+	 */
+	if (doctree != NULL)
+		xmlFreeDoc(doctree);
+	xmlCleanupParser();
+
+	PG_RETURN_BOOL(well_formed);
+}
+
+static xmlChar *
+pgxmlNodeSetToText(xmlNodeSetPtr nodeset,
+				   xmlDocPtr doc,
+				   xmlChar * toptagname,
+				   xmlChar * septagname,
+				   int format)
+{
+	/*
+	 * Serialise a node set as text.  Each node is written with
+	 * xmlNodeDump into one xmlBuffer, wrapped in <septagname> tags when
+	 * septagname is non-empty; the whole output is wrapped in
+	 * <toptagname> tags when toptagname is non-empty.  A nonzero format
+	 * puts a newline after every node, and format == 2 also asks
+	 * xmlNodeDump for formatted output.  A copy of the buffer contents is
+	 * returned.
+	 */
+	xmlBufferPtr out = xmlBufferCreate();
+	xmlChar    *text_copy;
+	int			wrap_all = (toptagname != NULL) && (xmlStrlen(toptagname) > 0);
+	int			wrap_each = (septagname != NULL) && (xmlStrlen(septagname) > 0);
+	int			pretty = (format == 2);
+	int			n;
+
+	if (wrap_all)
+	{
+		xmlBufferWriteChar(out, "<");
+		xmlBufferWriteCHAR(out, toptagname);
+		xmlBufferWriteChar(out, ">");
+	}
+
+	/* a NULL node set simply contributes no nodes */
+	for (n = 0; nodeset != NULL && n < nodeset->nodeNr; n++)
+	{
+		if (wrap_each)
+		{
+			xmlBufferWriteChar(out, "<");
+			xmlBufferWriteCHAR(out, septagname);
+			xmlBufferWriteChar(out, ">");
+		}
+
+		xmlNodeDump(out, doc, nodeset->nodeTab[n], 1, pretty);
+
+		if (wrap_each)
+		{
+			xmlBufferWriteChar(out, "</");
+			xmlBufferWriteCHAR(out, septagname);
+			xmlBufferWriteChar(out, ">");
+		}
+		if (format)
+			xmlBufferWriteChar(out, "\n");
+	}
+
+	if (wrap_all)
+	{
+		xmlBufferWriteChar(out, "</");
+		xmlBufferWriteCHAR(out, toptagname);
+		xmlBufferWriteChar(out, ">");
+	}
+
+	/* duplicate the contents so the buffer itself can be released */
+	text_copy = xmlStrdup(out->content);
+	xmlBufferFree(out);
+	return text_copy;
+}
+
+static xmlChar *
+pgxml_texttoxmlchar(text *textstring)
+{
+	/* Copy the text payload into a fresh, NUL-terminated palloc'd string. */
+	int32		nbytes = VARSIZE(textstring) - VARHDRSZ;
+	xmlChar    *copy = (xmlChar *) palloc(nbytes + 1);
+
+	memcpy((char *) copy, VARDATA(textstring), nbytes);
+	copy[nbytes] = '\0';
+	return copy;
+}
+
+
+PG_FUNCTION_INFO_V1(pgxml_xpath);
+
+/*
+ * pgxml_xpath(document, xpath, toptag, septag)
+ *
+ * Parse the document, evaluate the XPath expression against it and return
+ * the result as text.  A node-set result is serialised by
+ * pgxmlNodeSetToText using toptag and septag; a string result is returned
+ * as is; any other result type yields "<unsupported/>" plus a warning.
+ *
+ * Returns NULL when the document is not well-formed, or when the
+ * expression cannot be compiled or evaluated.
+ *
+ * Every exit goes through the cleanup label, so the libxml objects and
+ * the converted argument strings are released on all paths.
+ */
+Datum
+pgxml_xpath(PG_FUNCTION_ARGS)
+{
+	xmlDocPtr	doctree = NULL;
+	xmlXPathContextPtr ctxt = NULL;
+	xmlXPathObjectPtr res = NULL;
+	xmlXPathCompExprPtr comppath;
+	xmlChar    *xpath,
+			   *toptag,
+			   *septag;
+	xmlChar    *xpresstr = NULL;
+
+	int32		docsize,
+				ressize;
+	text	   *t;
+	text	   *xpres = NULL;
+
+	t = PG_GETARG_TEXT_P(0);	/* document buffer */
+	xpath = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(1));	/* XPath expression */
+	toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2));
+	septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(3));
+
+	docsize = VARSIZE(t) - VARHDRSZ;
+
+	pgxml_parser_init();
+
+	doctree = xmlParseMemory((char *) VARDATA(t), docsize);
+	if (doctree == NULL)
+		goto cleanup;			/* not well-formed */
+
+	ctxt = xmlXPathNewContext(doctree);
+	if (ctxt == NULL)
+		goto cleanup;			/* no context, nothing to evaluate against */
+	ctxt->node = xmlDocGetRootElement(doctree);
+
+	/* compile the path */
+	comppath = xmlXPathCompile(xpath);
+	if (comppath == NULL)
+	{
+		elog(WARNING, "XPath syntax error");
+		goto cleanup;
+	}
+
+	/* Now evaluate the path expression. */
+	res = xmlXPathCompiledEval(comppath, ctxt);
+	xmlXPathFreeCompExpr(comppath);
+	if (res == NULL)
+		goto cleanup;			/* seems appropriate */
+
+	/* now we dump this node, ?surrounding by tags? */
+	/* To do this, we look first at the type */
+	switch (res->type)
+	{
+		case XPATH_NODESET:
+			xpresstr = pgxmlNodeSetToText(res->nodesetval,
+										  doctree,
+										  toptag, septag, 0);
+			break;
+		case XPATH_STRING:
+			xpresstr = xmlStrdup(res->stringval);
+			break;
+		default:
+			elog(WARNING, "Unsupported XQuery result: %d", res->type);
+			xpresstr = xmlStrdup((const xmlChar *) "<unsupported/>");
+			break;
+	}
+
+	/* Now convert this result back to text (NULL if nothing was produced) */
+	if (xpresstr != NULL)
+	{
+		ressize = xmlStrlen(xpresstr);
+		xpres = (text *) palloc(ressize + VARHDRSZ);
+		memcpy(VARDATA(xpres), xpresstr, ressize);
+		VARATT_SIZEP(xpres) = ressize + VARHDRSZ;
+	}
+
+cleanup:
+	/* Free various storage; the document goes last among libxml objects */
+	if (xpresstr != NULL)
+		xmlFree(xpresstr);
+	if (res != NULL)
+		xmlXPathFreeObject(res);
+	if (ctxt != NULL)
+		xmlXPathFreeContext(ctxt);
+	if (doctree != NULL)
+		xmlFreeDoc(doctree);
+	pfree((void *) septag);
+	pfree((void *) toptag);
+	pfree((void *) xpath);
+	xmlCleanupParser();
+
+	if (xpres == NULL)
+		PG_RETURN_NULL();
+
+	PG_RETURN_TEXT_P(xpres);
+}
--- a/contrib/xml/pgxml_dom.sql.in
+++ b/contrib/xml/pgxml_dom.sql.in
@ -0,0 +1,10 @@
+-- SQL for XML parser
+
+-- Adjust this setting to control where the objects get created.
+SET search_path TO public;
+
+CREATE OR REPLACE FUNCTION pgxml_parse(text) RETURNS boolean	-- pgxml_parse(document)
+    AS 'MODULE_PATHNAME' LANGUAGE c STRICT;
+
+CREATE OR REPLACE FUNCTION pgxml_xpath(text, text, text, text) RETURNS text	-- pgxml_xpath(document, xpath, toptag, septag)
+    AS 'MODULE_PATHNAME' LANGUAGE c STRICT;
--- a/contrib/xml2/Makefile
+++ b/contrib/xml2/Makefile
--- a/contrib/xml2/README.pgxml
+++ b/contrib/xml2/README.pgxml
--- a/contrib/xml2/pgxml.sql.in
+++ b/contrib/xml2/pgxml.sql.in
--- a/contrib/xml2/xpath.c
+++ b/contrib/xml2/xpath.c
--- a/contrib/xml2/xslt_proc.c
+++ b/contrib/xml2/xslt_proc.c