Tweak processing of multiple-index-scan plans to reduce overhead when handling many-way scans: instead of re-evaluating all prior indexscan quals to see if a tuple has been fetched more than once, use a hash table indexed by tuple CTID. But fall back to the old way if the hash table grows to exceed SortMem.
2003-08-22 20:26:43 +00:00 · 2003-08-22 20:26:43 +00:00 · 92ee2528d8
commit 92ee2528d8
parent 38e2bf6283
2 changed files with 153 additions and 31 deletions
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.82 2003/08/04 02:39:59 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.83 2003/08/22 20:26:43 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -28,19 +28,51 @@
 #include "access/heapam.h"
 #include "executor/execdebug.h"
 #include "executor/nodeIndexscan.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
 #include "parser/parsetree.h"
-/* ----------------
+
 *		Misc stuff to move to executor.h soon -cim 6/5/90
 * ----------------
 */
 #define NO_OP			0
 #define LEFT_OP			1
 #define RIGHT_OP		2
 /*
 * In a multiple-index plan, we must take care to return any given tuple
 * only once, even if it matches conditions of several index scans.  Our
 * preferred way to do this is to record already-returned tuples in a hash
 * table (using the TID as unique identifier).  However, in a very large
 * scan this could conceivably run out of memory.  We limit the hash table
 * to no more than SortMem KB; if it grows past that, we fall back to the
 * pre-7.4 technique: evaluate the prior-scan index quals again for each
 * tuple (which is space-efficient, but slow).
 *
 * When scanning backwards, we use scannum to determine when to emit the
 * tuple --- we have to re-emit a tuple in the same scan as it was first
 * encountered.
 *
 * Note: this code would break if the planner were ever to create a multiple
 * index plan with overall backwards direction, because the hashtable code
 * will emit a tuple the first time it is encountered (which would be the
 * highest scan in which it matches the index), but the evaluate-the-quals
 * code will emit a tuple in the lowest-numbered scan in which it's valid.
 * This could be fixed at need by making the evaluate-the-quals case more
 * complex.  Currently the planner will never create such a plan (since it
 * considers multi-index plans unordered anyway), so there's no need for
 * more complexity.
 */
 /*
  * DupHashTabEntry
  *
  * One entry of the duplicate-detection hash table used by multiple-index
  * scans.  An entry is created for each tuple TID the scan stores into the
  * table; scannum is set to the index-scan number (iss_IndexPtr) that was
  * current when the entry was created, and is compared against the current
  * scan number when the same TID is seen again.
  *
  * create_duphash() declares the key size as SizeOfIptrData, so the key is
  * the leading bytes of the entry --- hence tid must remain the first member.
  */
 typedef struct
 {
 	/* tid is the hash key and so must be first! */
 	ItemPointerData tid;		/* TID of a tuple we've returned */
 	int			scannum;		/* number of scan we returned it in */
 } DupHashTabEntry;
 static TupleTableSlot *IndexNext(IndexScanState *node);
 static void create_duphash(IndexScanState *node);
 /* ----------------------------------------------------------------
 *		IndexNext
@ -163,7 +195,7 @@ IndexNext(IndexScanState *node)
 		while ((tuple = index_getnext(scandesc, direction)) != NULL)
 		{
 			/*
-			 * store the scanned tuple in the scan tuple slot of the scan
+			 * Store the scanned tuple in the scan tuple slot of the scan
 			 * state.  Note: we pass 'false' because tuples returned by
 			 * amgetnext are pointers onto disk pages and must not be
 			 * pfree()'d.
@ -174,36 +206,80 @@ IndexNext(IndexScanState *node)
 						   false);		/* don't pfree */
 			/*
-			 * We must check to see if the current tuple was already
+			 * If it's a multiple-index scan, make sure not to double-report
-			 * matched by an earlier index, so we don't double-report it.
+			 * a tuple matched by more than one index.  (See notes above.)
 			 * We do this by passing the tuple through ExecQual and
 			 * checking for failure with all previous qualifications.
 			 */
-			if (node->iss_IndexPtr > 0)
+			if (numIndices > 1)
 			{
-				bool		prev_matches = false;
+				/* First try the hash table */
-				int			prev_index;
+				if (node->iss_DupHash)
-				List	   *qual;
+				{
 					DupHashTabEntry *entry;
 					bool	found;
-				econtext->ecxt_scantuple = slot;
+					entry = (DupHashTabEntry *)
-				ResetExprContext(econtext);
+						hash_search(node->iss_DupHash,
-				qual = node->indxqualorig;
+									&tuple->t_data->t_ctid,
-				for (prev_index = 0;
+									HASH_ENTER,
-					 prev_index < node->iss_IndexPtr;
+									&found);
-					 prev_index++)
+					if (entry == NULL ||
-				{
+						node->iss_DupHash->hctl->nentries > node->iss_MaxHash)
 					if (ExecQual((List *) lfirst(qual), econtext, false))
 					{
-						prev_matches = true;
+						/* out of memory (either hard or soft limit) */
-						break;
+						/* release hash table and fall thru to old code */
 						hash_destroy(node->iss_DupHash);
 						node->iss_DupHash = NULL;
 					}
 					else if (found)
 					{
 						/* pre-existing entry */
 						/*
 						 * It's duplicate if first emitted in a different
 						 * scan.  If same scan, we must be backing up, so
 						 * okay to emit again.
 						 */
 						if (entry->scannum != node->iss_IndexPtr)
 						{
 							/* Dup, so drop it and loop back for another */
 							ExecClearTuple(slot);
 							continue;
 						}
 					}
 					else
 					{
 						/* new entry, finish filling it in */
 						entry->scannum = node->iss_IndexPtr;
 					}
 					qual = lnext(qual);
 				}
-				if (prev_matches)
+				/* If hash table has overflowed, do it the hard way */
 				if (node->iss_DupHash == NULL &&
 					node->iss_IndexPtr > 0)
 				{
-					/* Duplicate, so drop it and loop back for another */
+					bool		prev_matches = false;
-					ExecClearTuple(slot);
+					int			prev_index;
-					continue;
+					List	   *qual;
 					econtext->ecxt_scantuple = slot;
 					ResetExprContext(econtext);
 					qual = node->indxqualorig;
 					for (prev_index = 0;
 						 prev_index < node->iss_IndexPtr;
 						 prev_index++)
 					{
 						if (ExecQual((List *) lfirst(qual), econtext, false))
 						{
 							prev_matches = true;
 							break;
 						}
 						qual = lnext(qual);
 					}
 					if (prev_matches)
 					{
 						/* Dup, so drop it and loop back for another */
 						ExecClearTuple(slot);
 						continue;
 					}
 				}
 			}
@ -383,6 +459,14 @@ ExecIndexReScan(IndexScanState *node, ExprContext *exprCtxt)
 		return;
 	}
 	/* reset hash table */
 	if (numIndices > 1)
 	{
 		if (node->iss_DupHash)
 			hash_destroy(node->iss_DupHash);
 		create_duphash(node);
 	}
 	/* reset index scans */
 	if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indxorderdir))
 		node->iss_IndexPtr = numIndices;
@ -432,6 +516,10 @@ ExecEndIndexScan(IndexScanState *node)
 	ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
 	ExecClearTuple(node->ss.ss_ScanTupleSlot);
 	/* drop hash table */
 	if (node->iss_DupHash)
 		hash_destroy(node->iss_DupHash);
 	/*
 	 * close the index relations
 	 */
@ -507,7 +595,7 @@ ExecIndexRestrPos(IndexScanState *node)
 /* ----------------------------------------------------------------
 *		ExecInitIndexScan
-  *
+ *
 *		Initializes the index scan's state information, creates
 *		scan keys, and opens the base and index relations.
 *
@ -919,12 +1007,42 @@ ExecInitIndexScan(IndexScan *node, EState *estate)
 	ExecAssignResultTypeFromTL(&indexstate->ss.ps);
 	ExecAssignScanProjectionInfo(&indexstate->ss);
 	/*
 	 * Initialize hash table if needed.
 	 */
 	if (numIndices > 1)
 		create_duphash(indexstate);
 	else
 		indexstate->iss_DupHash = NULL;
 	/*
 	 * all done.
 	 */
 	return indexstate;
 }
 /*
  * create_duphash
  *
  *		Create the hash table used to detect already-returned tuples in a
  *		multiple-index scan, and compute the soft limit on its size.
  *
  * node->iss_MaxHash is set to the number of entries that fit in SortMem
  * kilobytes, counting the per-element header plus the entry itself.
  * node->iss_DupHash is set to the new table, which is allocated in
  * CurrentMemoryContext.  An ERROR is raised if the table cannot be created.
  *
  * The initial size hint comes from the planner's row estimate, but it is
  * clamped to the range [1, iss_MaxHash]: there is no point in pre-sizing
  * the table beyond the limit at which the scan abandons it, and an
  * unclamped estimate could request far more memory than SortMem allows.
  * The clamping is done on the double value before converting to long,
  * because converting an out-of-range (or NaN) double to an integer type
  * is undefined behavior in C.
  */
 static void
 create_duphash(IndexScanState *node)
 {
 	HASHCTL		hash_ctl;
 	double		est_rows;
 	long		nbuckets;
 	/* Compute the soft limit first; it bounds the initial size below. */
 	node->iss_MaxHash = (SortMem * 1024L) /
 		(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(DupHashTabEntry)));
 	/* Derive a sane initial size from the planner's row estimate. */
 	est_rows = ceil(node->ss.ps.plan->plan_rows);
 	if (!(est_rows >= 1.0))
 		nbuckets = 1;			/* zero, negative, or NaN estimate */
 	else if (est_rows > (double) node->iss_MaxHash)
 		nbuckets = node->iss_MaxHash;
 	else
 		nbuckets = (long) est_rows;
 	if (nbuckets < 1)
 		nbuckets = 1;			/* in case iss_MaxHash itself is < 1 */
 	MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 	hash_ctl.keysize = SizeOfIptrData;
 	hash_ctl.entrysize = sizeof(DupHashTabEntry);
 	hash_ctl.hash = tag_hash;
 	hash_ctl.hcxt = CurrentMemoryContext;
 	node->iss_DupHash = hash_create("DupHashTable",
 									nbuckets,
 									&hash_ctl,
 									HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 	if (node->iss_DupHash == NULL)
 		ereport(ERROR,
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("out of memory")));
 }
 int
 ExecCountSlotsIndexScan(IndexScan *node)
 {
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $Id: execnodes.h,v 1.104 2003/08/19 01:13:41 tgl Exp $
+ * $Id: execnodes.h,v 1.105 2003/08/22 20:26:43 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -768,6 +768,8 @@ typedef ScanState SeqScanState;
 *		RuntimeKeysReady   true if runtime Skeys have been computed
 *		RelationDescs	   ptr to array of relation descriptors
 *		ScanDescs		   ptr to array of scan descriptors
 *		DupHash			   hashtable for recognizing dups in multiple scan
 *		MaxHash			   max # entries we will allow in hashtable
 * ----------------
 */
 typedef struct IndexScanState
@ -785,6 +787,8 @@ typedef struct IndexScanState
 	bool		iss_RuntimeKeysReady;
 	RelationPtr iss_RelationDescs;
 	IndexScanDescPtr iss_ScanDescs;
 	HTAB	   *iss_DupHash;
 	long		iss_MaxHash;
 } IndexScanState;
 /* ----------------