mirror of https://github.com/postgres/postgres
Improve efficiency of dblink by using libpq's new row processor API.
This patch provides a test case for libpq's row processor API. contrib/dblink can deal with very large result sets by dumping them into a tuplestore (which can spill to disk) --- but until now, the intermediate storage of the query result in a PGresult meant memory bloat for any large result. Now we use a row processor to convert the data to tuple form and dump it directly into the tuplestore. A limitation is that this only works for plain dblink() queries, not dblink_send_query() followed by dblink_get_result(). In the latter case we don't know the desired tuple rowtype soon enough. While hack solutions to that are possible, a different user-level API would probably be a better answer. Kyotaro Horiguchi, reviewed by Marko Kreen and Tom Lane
This commit is contained in:
parent
92785dac2e
commit
6f922ef88e
|
@ -63,12 +63,28 @@ typedef struct remoteConn
|
|||
bool newXactForCursor; /* Opened a transaction for a cursor */
|
||||
} remoteConn;
|
||||
|
||||
typedef struct storeInfo
|
||||
{
|
||||
FunctionCallInfo fcinfo;
|
||||
Tuplestorestate *tuplestore;
|
||||
AttInMetadata *attinmeta;
|
||||
MemoryContext tmpcontext;
|
||||
char **cstrs;
|
||||
} storeInfo;
|
||||
|
||||
/*
|
||||
* Internal declarations
|
||||
*/
|
||||
static Datum dblink_record_internal(FunctionCallInfo fcinfo, bool is_async);
|
||||
static void prepTuplestoreResult(FunctionCallInfo fcinfo);
|
||||
static void materializeResult(FunctionCallInfo fcinfo, PGresult *res);
|
||||
static void materializeQueryResult(FunctionCallInfo fcinfo,
|
||||
PGconn *conn,
|
||||
const char *conname,
|
||||
const char *sql,
|
||||
bool fail);
|
||||
static int storeHandler(PGresult *res, const PGdataValue *columns,
|
||||
const char **errmsgp, void *param);
|
||||
static remoteConn *getConnectionByName(const char *name);
|
||||
static HTAB *createConnHash(void);
|
||||
static void createNewConnection(const char *name, remoteConn *rconn);
|
||||
|
@ -629,100 +645,118 @@ dblink_get_result(PG_FUNCTION_ARGS)
|
|||
static Datum
|
||||
dblink_record_internal(FunctionCallInfo fcinfo, bool is_async)
|
||||
{
|
||||
char *msg;
|
||||
PGresult *res = NULL;
|
||||
PGconn *conn = NULL;
|
||||
char *connstr = NULL;
|
||||
char *sql = NULL;
|
||||
char *conname = NULL;
|
||||
remoteConn *rconn = NULL;
|
||||
bool fail = true; /* default to backward compatible */
|
||||
bool freeconn = false;
|
||||
PGconn *volatile conn = NULL;
|
||||
volatile bool freeconn = false;
|
||||
|
||||
prepTuplestoreResult(fcinfo);
|
||||
|
||||
DBLINK_INIT;
|
||||
|
||||
if (!is_async)
|
||||
PG_TRY();
|
||||
{
|
||||
if (PG_NARGS() == 3)
|
||||
char *msg;
|
||||
char *connstr = NULL;
|
||||
char *sql = NULL;
|
||||
char *conname = NULL;
|
||||
remoteConn *rconn = NULL;
|
||||
bool fail = true; /* default to backward compatible */
|
||||
|
||||
if (!is_async)
|
||||
{
|
||||
/* text,text,bool */
|
||||
DBLINK_GET_CONN;
|
||||
sql = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
||||
fail = PG_GETARG_BOOL(2);
|
||||
}
|
||||
else if (PG_NARGS() == 2)
|
||||
{
|
||||
/* text,text or text,bool */
|
||||
if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID)
|
||||
{
|
||||
conn = pconn->conn;
|
||||
sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
|
||||
fail = PG_GETARG_BOOL(1);
|
||||
}
|
||||
else
|
||||
if (PG_NARGS() == 3)
|
||||
{
|
||||
/* text,text,bool */
|
||||
DBLINK_GET_CONN;
|
||||
sql = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
||||
fail = PG_GETARG_BOOL(2);
|
||||
}
|
||||
else if (PG_NARGS() == 2)
|
||||
{
|
||||
/* text,text or text,bool */
|
||||
if (get_fn_expr_argtype(fcinfo->flinfo, 1) == BOOLOID)
|
||||
{
|
||||
conn = pconn->conn;
|
||||
sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
|
||||
fail = PG_GETARG_BOOL(1);
|
||||
}
|
||||
else
|
||||
{
|
||||
DBLINK_GET_CONN;
|
||||
sql = text_to_cstring(PG_GETARG_TEXT_PP(1));
|
||||
}
|
||||
}
|
||||
else if (PG_NARGS() == 1)
|
||||
{
|
||||
/* text */
|
||||
conn = pconn->conn;
|
||||
sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
|
||||
}
|
||||
else
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "wrong number of arguments");
|
||||
}
|
||||
else /* is_async */
|
||||
{
|
||||
/* get async result */
|
||||
if (PG_NARGS() == 2)
|
||||
{
|
||||
/* text,bool */
|
||||
DBLINK_GET_NAMED_CONN;
|
||||
fail = PG_GETARG_BOOL(1);
|
||||
}
|
||||
else if (PG_NARGS() == 1)
|
||||
{
|
||||
/* text */
|
||||
DBLINK_GET_NAMED_CONN;
|
||||
}
|
||||
else
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "wrong number of arguments");
|
||||
}
|
||||
|
||||
if (!conn)
|
||||
DBLINK_CONN_NOT_AVAIL;
|
||||
|
||||
if (!is_async)
|
||||
{
|
||||
/* synchronous query, use efficient tuple collection method */
|
||||
materializeQueryResult(fcinfo, conn, conname, sql, fail);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* async result retrieval, do it the old way */
|
||||
PGresult *res = PQgetResult(conn);
|
||||
|
||||
/* NULL means we're all done with the async results */
|
||||
if (res)
|
||||
{
|
||||
if (PQresultStatus(res) != PGRES_COMMAND_OK &&
|
||||
PQresultStatus(res) != PGRES_TUPLES_OK)
|
||||
{
|
||||
dblink_res_error(conname, res, "could not execute query",
|
||||
fail);
|
||||
/* if fail isn't set, we'll return an empty query result */
|
||||
}
|
||||
else
|
||||
{
|
||||
materializeResult(fcinfo, res);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (PG_NARGS() == 1)
|
||||
{
|
||||
/* text */
|
||||
conn = pconn->conn;
|
||||
sql = text_to_cstring(PG_GETARG_TEXT_PP(0));
|
||||
}
|
||||
else
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "wrong number of arguments");
|
||||
}
|
||||
else /* is_async */
|
||||
PG_CATCH();
|
||||
{
|
||||
/* get async result */
|
||||
if (PG_NARGS() == 2)
|
||||
{
|
||||
/* text,bool */
|
||||
DBLINK_GET_NAMED_CONN;
|
||||
fail = PG_GETARG_BOOL(1);
|
||||
}
|
||||
else if (PG_NARGS() == 1)
|
||||
{
|
||||
/* text */
|
||||
DBLINK_GET_NAMED_CONN;
|
||||
}
|
||||
else
|
||||
/* shouldn't happen */
|
||||
elog(ERROR, "wrong number of arguments");
|
||||
/* if needed, close the connection to the database */
|
||||
if (freeconn)
|
||||
PQfinish(conn);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
if (!conn)
|
||||
DBLINK_CONN_NOT_AVAIL;
|
||||
|
||||
/* synchronous query, or async result retrieval */
|
||||
if (!is_async)
|
||||
res = PQexec(conn, sql);
|
||||
else
|
||||
{
|
||||
res = PQgetResult(conn);
|
||||
/* NULL means we're all done with the async results */
|
||||
if (!res)
|
||||
return (Datum) 0;
|
||||
}
|
||||
|
||||
/* if needed, close the connection to the database and cleanup */
|
||||
/* if needed, close the connection to the database */
|
||||
if (freeconn)
|
||||
PQfinish(conn);
|
||||
|
||||
if (!res ||
|
||||
(PQresultStatus(res) != PGRES_COMMAND_OK &&
|
||||
PQresultStatus(res) != PGRES_TUPLES_OK))
|
||||
{
|
||||
dblink_res_error(conname, res, "could not execute query", fail);
|
||||
return (Datum) 0;
|
||||
}
|
||||
|
||||
materializeResult(fcinfo, res);
|
||||
return (Datum) 0;
|
||||
}
|
||||
|
||||
|
@ -890,6 +924,259 @@ materializeResult(FunctionCallInfo fcinfo, PGresult *res)
|
|||
PG_END_TRY();
|
||||
}
|
||||
|
||||
/*
|
||||
* Execute the given SQL command and store its results into a tuplestore
|
||||
* to be returned as the result of the current function.
|
||||
* This is equivalent to PQexec followed by materializeResult, but we make
|
||||
* use of libpq's "row processor" API to reduce per-row overhead.
|
||||
*/
|
||||
static void
|
||||
materializeQueryResult(FunctionCallInfo fcinfo,
|
||||
PGconn *conn,
|
||||
const char *conname,
|
||||
const char *sql,
|
||||
bool fail)
|
||||
{
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
|
||||
PGresult *volatile res = NULL;
|
||||
storeInfo sinfo;
|
||||
|
||||
/* prepTuplestoreResult must have been called previously */
|
||||
Assert(rsinfo->returnMode == SFRM_Materialize);
|
||||
|
||||
PG_TRY();
|
||||
{
|
||||
/* initialize storeInfo to empty */
|
||||
memset(&sinfo, 0, sizeof(sinfo));
|
||||
sinfo.fcinfo = fcinfo;
|
||||
|
||||
/* We'll collect tuples using storeHandler */
|
||||
PQsetRowProcessor(conn, storeHandler, &sinfo);
|
||||
|
||||
res = PQexec(conn, sql);
|
||||
|
||||
/* We don't keep the custom row processor installed permanently */
|
||||
PQsetRowProcessor(conn, NULL, NULL);
|
||||
|
||||
if (!res ||
|
||||
(PQresultStatus(res) != PGRES_COMMAND_OK &&
|
||||
PQresultStatus(res) != PGRES_TUPLES_OK))
|
||||
{
|
||||
/*
|
||||
* dblink_res_error will clear the passed PGresult, so we need
|
||||
* this ugly dance to avoid doing so twice during error exit
|
||||
*/
|
||||
PGresult *res1 = res;
|
||||
|
||||
res = NULL;
|
||||
dblink_res_error(conname, res1, "could not execute query", fail);
|
||||
/* if fail isn't set, we'll return an empty query result */
|
||||
}
|
||||
else if (PQresultStatus(res) == PGRES_COMMAND_OK)
|
||||
{
|
||||
/*
|
||||
* storeHandler didn't get called, so we need to convert the
|
||||
* command status string to a tuple manually
|
||||
*/
|
||||
TupleDesc tupdesc;
|
||||
AttInMetadata *attinmeta;
|
||||
Tuplestorestate *tupstore;
|
||||
HeapTuple tuple;
|
||||
char *values[1];
|
||||
MemoryContext oldcontext;
|
||||
|
||||
/*
|
||||
* need a tuple descriptor representing one TEXT column to return
|
||||
* the command status string as our result tuple
|
||||
*/
|
||||
tupdesc = CreateTemplateTupleDesc(1, false);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "status",
|
||||
TEXTOID, -1, 0);
|
||||
attinmeta = TupleDescGetAttInMetadata(tupdesc);
|
||||
|
||||
oldcontext = MemoryContextSwitchTo(
|
||||
rsinfo->econtext->ecxt_per_query_memory);
|
||||
tupstore = tuplestore_begin_heap(true, false, work_mem);
|
||||
rsinfo->setResult = tupstore;
|
||||
rsinfo->setDesc = tupdesc;
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
values[0] = PQcmdStatus(res);
|
||||
|
||||
/* build the tuple and put it into the tuplestore. */
|
||||
tuple = BuildTupleFromCStrings(attinmeta, values);
|
||||
tuplestore_puttuple(tupstore, tuple);
|
||||
|
||||
PQclear(res);
|
||||
}
|
||||
else
|
||||
{
|
||||
Assert(PQresultStatus(res) == PGRES_TUPLES_OK);
|
||||
/* storeHandler should have created a tuplestore */
|
||||
Assert(rsinfo->setResult != NULL);
|
||||
|
||||
PQclear(res);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
/* be sure to unset the custom row processor */
|
||||
PQsetRowProcessor(conn, NULL, NULL);
|
||||
/* be sure to release any libpq result we collected */
|
||||
if (res)
|
||||
PQclear(res);
|
||||
/* and clear out any pending data in libpq */
|
||||
while ((res = PQskipResult(conn)) != NULL)
|
||||
PQclear(res);
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
}
|
||||
|
||||
/*
|
||||
* Custom row processor for materializeQueryResult.
|
||||
* Prototype of this function must match PQrowProcessor.
|
||||
*/
|
||||
static int
|
||||
storeHandler(PGresult *res, const PGdataValue *columns,
|
||||
const char **errmsgp, void *param)
|
||||
{
|
||||
storeInfo *sinfo = (storeInfo *) param;
|
||||
int nfields = PQnfields(res);
|
||||
char **cstrs = sinfo->cstrs;
|
||||
HeapTuple tuple;
|
||||
char *pbuf;
|
||||
int pbuflen;
|
||||
int i;
|
||||
MemoryContext oldcontext;
|
||||
|
||||
if (columns == NULL)
|
||||
{
|
||||
/* Prepare for new result set */
|
||||
ReturnSetInfo *rsinfo = (ReturnSetInfo *) sinfo->fcinfo->resultinfo;
|
||||
TupleDesc tupdesc;
|
||||
|
||||
/*
|
||||
* It's possible to get more than one result set if the query string
|
||||
* contained multiple SQL commands. In that case, we follow PQexec's
|
||||
* traditional behavior of throwing away all but the last result.
|
||||
*/
|
||||
if (sinfo->tuplestore)
|
||||
tuplestore_end(sinfo->tuplestore);
|
||||
sinfo->tuplestore = NULL;
|
||||
|
||||
/* get a tuple descriptor for our result type */
|
||||
switch (get_call_result_type(sinfo->fcinfo, NULL, &tupdesc))
|
||||
{
|
||||
case TYPEFUNC_COMPOSITE:
|
||||
/* success */
|
||||
break;
|
||||
case TYPEFUNC_RECORD:
|
||||
/* failed to determine actual type of RECORD */
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("function returning record called in context "
|
||||
"that cannot accept type record")));
|
||||
break;
|
||||
default:
|
||||
/* result type isn't composite */
|
||||
elog(ERROR, "return type must be a row type");
|
||||
break;
|
||||
}
|
||||
|
||||
/* make sure we have a persistent copy of the tupdesc */
|
||||
tupdesc = CreateTupleDescCopy(tupdesc);
|
||||
|
||||
/* check result and tuple descriptor have the same number of columns */
|
||||
if (nfields != tupdesc->natts)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_DATATYPE_MISMATCH),
|
||||
errmsg("remote query result rowtype does not match "
|
||||
"the specified FROM clause rowtype")));
|
||||
|
||||
/* Prepare attinmeta for later data conversions */
|
||||
sinfo->attinmeta = TupleDescGetAttInMetadata(tupdesc);
|
||||
|
||||
/* Create a new, empty tuplestore */
|
||||
oldcontext = MemoryContextSwitchTo(
|
||||
rsinfo->econtext->ecxt_per_query_memory);
|
||||
sinfo->tuplestore = tuplestore_begin_heap(true, false, work_mem);
|
||||
rsinfo->setResult = sinfo->tuplestore;
|
||||
rsinfo->setDesc = tupdesc;
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
/*
|
||||
* Set up sufficiently-wide string pointers array; this won't change
|
||||
* in size so it's easy to preallocate.
|
||||
*/
|
||||
if (sinfo->cstrs)
|
||||
pfree(sinfo->cstrs);
|
||||
sinfo->cstrs = (char **) palloc(nfields * sizeof(char *));
|
||||
|
||||
/* Create short-lived memory context for data conversions */
|
||||
if (!sinfo->tmpcontext)
|
||||
sinfo->tmpcontext =
|
||||
AllocSetContextCreate(CurrentMemoryContext,
|
||||
"dblink temporary context",
|
||||
ALLOCSET_DEFAULT_MINSIZE,
|
||||
ALLOCSET_DEFAULT_INITSIZE,
|
||||
ALLOCSET_DEFAULT_MAXSIZE);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/*
|
||||
* Do the following work in a temp context that we reset after each tuple.
|
||||
* This cleans up not only the data we have direct access to, but any
|
||||
* cruft the I/O functions might leak.
|
||||
*/
|
||||
oldcontext = MemoryContextSwitchTo(sinfo->tmpcontext);
|
||||
|
||||
/*
|
||||
* The strings passed to us are not null-terminated, but the datatype
|
||||
* input functions we're about to call require null termination. Copy the
|
||||
* strings and add null termination. As a micro-optimization, allocate
|
||||
* all the strings with one palloc.
|
||||
*/
|
||||
pbuflen = nfields; /* count the null terminators themselves */
|
||||
for (i = 0; i < nfields; i++)
|
||||
{
|
||||
int len = columns[i].len;
|
||||
|
||||
if (len > 0)
|
||||
pbuflen += len;
|
||||
}
|
||||
pbuf = (char *) palloc(pbuflen);
|
||||
|
||||
for (i = 0; i < nfields; i++)
|
||||
{
|
||||
int len = columns[i].len;
|
||||
|
||||
if (len < 0)
|
||||
cstrs[i] = NULL;
|
||||
else
|
||||
{
|
||||
cstrs[i] = pbuf;
|
||||
memcpy(pbuf, columns[i].value, len);
|
||||
pbuf += len;
|
||||
*pbuf++ = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
/* Convert row to a tuple, and add it to the tuplestore */
|
||||
tuple = BuildTupleFromCStrings(sinfo->attinmeta, cstrs);
|
||||
|
||||
tuplestore_puttuple(sinfo->tuplestore, tuple);
|
||||
|
||||
/* Clean up */
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
MemoryContextReset(sinfo->tmpcontext);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* List all open dblink connections by name.
|
||||
* Returns an array of all connection names.
|
||||
|
|
|
@ -425,14 +425,6 @@ SELECT *
|
|||
<refsect1>
|
||||
<title>Notes</title>
|
||||
|
||||
<para>
|
||||
<function>dblink</> fetches the entire remote query result before
|
||||
returning any of it to the local system. If the query is expected
|
||||
to return a large number of rows, it's better to open it as a cursor
|
||||
with <function>dblink_open</> and then fetch a manageable number
|
||||
of rows at a time.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
A convenient way to use <function>dblink</> with predetermined
|
||||
queries is to create a view.
|
||||
|
@ -1432,6 +1424,18 @@ dblink_get_result(text connname [, bool fail_on_error]) returns setof record
|
|||
sent, and one additional time to obtain an empty set result,
|
||||
before the connection can be used again.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
When using <function>dblink_send_query</> and
|
||||
<function>dblink_get_result</>, <application>dblink</> fetches the entire
|
||||
remote query result before returning any of it to the local query
|
||||
processor. If the query returns a large number of rows, this can result
|
||||
in transient memory bloat in the local session. It may be better to open
|
||||
such a query as a cursor with <function>dblink_open</> and then fetch a
|
||||
manageable number of rows at a time. Alternatively, use plain
|
||||
<function>dblink()</>, which avoids memory bloat by spooling large result
|
||||
sets to disk.
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
|
|
Loading…
Reference in New Issue