Add new function WaitForParallelWorkersToAttach.

Once this function has been called, we know that all workers have started
and attached to their error queues -- so if any of them subsequently exit
uncleanly, we'll be sure to throw an ERROR promptly. Otherwise, users of
the ParallelContext machinery must be careful not to wait forever for a
worker that has failed to start. Parallel query manages to work without
needing this for reasons explained in new comments added by this patch,
but it's a useful primitive for other parallel operations, such as the
pending patch to make creating a btree index run in parallel.

Amit Kapila, revised by me. Additional review by Peter Geoghegan.

Discussion: http://postgr.es/m/CAA4eK1+e2MzyouF5bg=OtyhDSX+=Ao=3htN=T-r_6s3gCtKFiw@mail.gmail.com
This commit is contained in:
parent
a2a2205761
commit
9222c0d9ed
@ -437,10 +437,11 @@ ReinitializeParallelDSM(ParallelContext *pcxt)
|
||||
WaitForParallelWorkersToFinish(pcxt);
|
||||
WaitForParallelWorkersToExit(pcxt);
|
||||
pcxt->nworkers_launched = 0;
|
||||
if (pcxt->any_message_received)
|
||||
if (pcxt->known_attached_workers)
|
||||
{
|
||||
pfree(pcxt->any_message_received);
|
||||
pcxt->any_message_received = NULL;
|
||||
pfree(pcxt->known_attached_workers);
|
||||
pcxt->known_attached_workers = NULL;
|
||||
pcxt->nknown_attached_workers = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -542,16 +543,147 @@ LaunchParallelWorkers(ParallelContext *pcxt)
|
||||
|
||||
/*
|
||||
* Now that nworkers_launched has taken its final value, we can initialize
|
||||
* any_message_received.
|
||||
* known_attached_workers.
|
||||
*/
|
||||
if (pcxt->nworkers_launched > 0)
|
||||
pcxt->any_message_received =
|
||||
{
|
||||
pcxt->known_attached_workers =
|
||||
palloc0(sizeof(bool) * pcxt->nworkers_launched);
|
||||
pcxt->nknown_attached_workers = 0;
|
||||
}
|
||||
|
||||
/* Restore previous memory context. */
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for all workers to attach to their error queues, and throw an error if
|
||||
* any worker fails to do this.
|
||||
*
|
||||
* Callers can assume that if this function returns successfully, then the
|
||||
* number of workers given by pcxt->nworkers_launched have initialized and
|
||||
* attached to their error queues. Whether or not these workers are guaranteed
|
||||
* to still be running depends on what code the caller asked them to run;
|
||||
* this function does not guarantee that they have not exited. However, it
|
||||
* does guarantee that any workers which exited must have done so cleanly and
|
||||
* after successfully performing the work with which they were tasked.
|
||||
*
|
||||
* If this function is not called, then some of the workers that were launched
|
||||
* may not have been started due to a fork() failure, or may have exited during
|
||||
* early startup prior to attaching to the error queue, so nworkers_launched
|
||||
* cannot be viewed as completely reliable. It will never be less than the
|
||||
* number of workers which actually started, but it might be more. Any workers
|
||||
* that failed to start will still be discovered by
|
||||
* WaitForParallelWorkersToFinish and an error will be thrown at that time,
|
||||
* provided that function is eventually reached.
|
||||
*
|
||||
* In general, the leader process should do as much work as possible before
|
||||
* calling this function. fork() failures and other early-startup failures
|
||||
* are very uncommon, and having the leader sit idle when it could be doing
|
||||
* useful work is undesirable. However, if the leader needs to wait for
|
||||
* all of its workers or for a specific worker, it may want to call this
|
||||
* function before doing so. If not, it must make some other provision for
|
||||
* the failure-to-start case, lest it wait forever. On the other hand, a
|
||||
* leader which never waits for a worker that might not be started yet, or
|
||||
* at least never does so prior to WaitForParallelWorkersToFinish(), need not
|
||||
* call this function at all.
|
||||
*/
|
||||
void
|
||||
WaitForParallelWorkersToAttach(ParallelContext *pcxt)
|
||||
{
|
||||
int i;
|
||||
|
||||
/* Skip this if we have no launched workers. */
|
||||
if (pcxt->nworkers_launched == 0)
|
||||
return;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
/*
|
||||
* This will process any parallel messages that are pending and it may
|
||||
* also throw an error propagated from a worker.
|
||||
*/
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
for (i = 0; i < pcxt->nworkers_launched; ++i)
|
||||
{
|
||||
BgwHandleStatus status;
|
||||
shm_mq *mq;
|
||||
int rc;
|
||||
pid_t pid;
|
||||
|
||||
if (pcxt->known_attached_workers[i])
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If error_mqh is NULL, then the worker has already exited
|
||||
* cleanly.
|
||||
*/
|
||||
if (pcxt->worker[i].error_mqh == NULL)
|
||||
{
|
||||
pcxt->known_attached_workers[i] = true;
|
||||
++pcxt->nknown_attached_workers;
|
||||
continue;
|
||||
}
|
||||
|
||||
status = GetBackgroundWorkerPid(pcxt->worker[i].bgwhandle, &pid);
|
||||
if (status == BGWH_STARTED)
|
||||
{
|
||||
/* Has the worker attached to the error queue? */
|
||||
mq = shm_mq_get_queue(pcxt->worker[i].error_mqh);
|
||||
if (shm_mq_get_sender(mq) != NULL)
|
||||
{
|
||||
/* Yes, so it is known to be attached. */
|
||||
pcxt->known_attached_workers[i] = true;
|
||||
++pcxt->nknown_attached_workers;
|
||||
}
|
||||
}
|
||||
else if (status == BGWH_STOPPED)
|
||||
{
|
||||
/*
|
||||
* If the worker stopped without attaching to the error queue,
|
||||
* throw an error.
|
||||
*/
|
||||
mq = shm_mq_get_queue(pcxt->worker[i].error_mqh);
|
||||
if (shm_mq_get_sender(mq) == NULL)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("parallel worker failed to initialize"),
|
||||
errhint("More details may be available in the server log.")));
|
||||
|
||||
pcxt->known_attached_workers[i] = true;
|
||||
++pcxt->nknown_attached_workers;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Worker not yet started, so we must wait. The postmaster
|
||||
* will notify us if the worker's state changes. Our latch
|
||||
* might also get set for some other reason, but if so we'll
|
||||
* just end up waiting for the same worker again.
|
||||
*/
|
||||
rc = WaitLatch(MyLatch,
|
||||
WL_LATCH_SET | WL_POSTMASTER_DEATH,
|
||||
-1, WAIT_EVENT_BGWORKER_STARTUP);
|
||||
|
||||
/* emergency bailout if postmaster has died */
|
||||
if (rc & WL_POSTMASTER_DEATH)
|
||||
proc_exit(1);
|
||||
|
||||
if (rc & WL_LATCH_SET)
|
||||
ResetLatch(MyLatch);
|
||||
}
|
||||
}
|
||||
|
||||
/* If all workers are known to have started, we're done. */
|
||||
if (pcxt->nknown_attached_workers >= pcxt->nworkers_launched)
|
||||
{
|
||||
Assert(pcxt->nknown_attached_workers == pcxt->nworkers_launched);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Wait for all workers to finish computing.
|
||||
*
|
||||
@ -589,7 +721,7 @@ WaitForParallelWorkersToFinish(ParallelContext *pcxt)
|
||||
*/
|
||||
if (pcxt->worker[i].error_mqh == NULL)
|
||||
++nfinished;
|
||||
else if (pcxt->any_message_received[i])
|
||||
else if (pcxt->known_attached_workers[i])
|
||||
{
|
||||
anyone_alive = true;
|
||||
break;
|
||||
@ -909,8 +1041,12 @@ HandleParallelMessage(ParallelContext *pcxt, int i, StringInfo msg)
|
||||
{
|
||||
char msgtype;
|
||||
|
||||
if (pcxt->any_message_received != NULL)
|
||||
pcxt->any_message_received[i] = true;
|
||||
if (pcxt->known_attached_workers != NULL &&
|
||||
!pcxt->known_attached_workers[i])
|
||||
{
|
||||
pcxt->known_attached_workers[i] = true;
|
||||
pcxt->nknown_attached_workers++;
|
||||
}
|
||||
|
||||
msgtype = pq_getmsgbyte(msg);
|
||||
|
||||
|
@ -312,7 +312,14 @@ gather_readnext(GatherState *gatherstate)
|
||||
/* Check for async events, particularly messages from workers. */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Attempt to read a tuple, but don't block if none is available. */
|
||||
/*
|
||||
* Attempt to read a tuple, but don't block if none is available.
|
||||
*
|
||||
* Note that TupleQueueReaderNext will just return NULL for a worker
|
||||
* which fails to initialize. We'll treat that worker as having
|
||||
* produced no tuples; WaitForParallelWorkersToFinish will error out
|
||||
* when we get there.
|
||||
*/
|
||||
Assert(gatherstate->nextreader < gatherstate->nreaders);
|
||||
reader = gatherstate->reader[gatherstate->nextreader];
|
||||
tup = TupleQueueReaderNext(reader, true, &readerdone);
|
||||
|
@ -710,7 +710,14 @@ gm_readnext_tuple(GatherMergeState *gm_state, int nreader, bool nowait,
|
||||
/* Check for async events, particularly messages from workers. */
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Attempt to read a tuple. */
|
||||
/*
|
||||
* Attempt to read a tuple.
|
||||
*
|
||||
* Note that TupleQueueReaderNext will just return NULL for a worker which
|
||||
* fails to initialize. We'll treat that worker as having produced no
|
||||
* tuples; WaitForParallelWorkersToFinish will error out when we get
|
||||
* there.
|
||||
*/
|
||||
reader = gm_state->reader[nreader - 1];
|
||||
tup = TupleQueueReaderNext(reader, nowait, done);
|
||||
|
||||
|
@ -43,7 +43,8 @@ typedef struct ParallelContext
|
||||
void *private_memory;
|
||||
shm_toc *toc;
|
||||
ParallelWorkerInfo *worker;
|
||||
bool *any_message_received;
|
||||
int nknown_attached_workers;
|
||||
bool *known_attached_workers;
|
||||
} ParallelContext;
|
||||
|
||||
typedef struct ParallelWorkerContext
|
||||
@ -62,6 +63,7 @@ extern ParallelContext *CreateParallelContext(const char *library_name, const ch
|
||||
extern void InitializeParallelDSM(ParallelContext *pcxt);
|
||||
extern void ReinitializeParallelDSM(ParallelContext *pcxt);
|
||||
extern void LaunchParallelWorkers(ParallelContext *pcxt);
|
||||
extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt);
|
||||
extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt);
|
||||
extern void DestroyParallelContext(ParallelContext *pcxt);
|
||||
extern bool ParallelContextActive(void);
|
||||
|
Loading…
x
Reference in New Issue
Block a user