Prevent summarizer hang when summarize_wal turned off and back on.

Before this commit, when the WAL summarizer started up or recovered
from an error, it would resume summarization from wherever it left
off. That was OK normally, but wrong if summarize_wal had been
turned off temporarily, allowing some WAL to be removed, and then turned
back on again. In such cases, the WAL summarizer would simply hang
forever. This commit changes the reinitialization sequence for the WAL
summarizer to rederive the starting position in the way we were
already doing at initial startup, fixing the problem.

Per report from Israel Barth Rubio. Reviewed by Tom Lane.

Discussion: http://postgr.es/m/CA+TgmoYN6x=YS+FoFOS6=nr6=qkXZFWhdiL7k0oatGwug2hcuA@mail.gmail.com
This commit is contained in:
Robert Haas 2024-06-25 15:42:36 -04:00
parent 55e56c84da
commit 065583cf46
3 changed files with 45 additions and 43 deletions

View File

@ -7916,7 +7916,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
* If WAL summarization is in use, don't remove WAL that has yet to be * If WAL summarization is in use, don't remove WAL that has yet to be
* summarized. * summarized.
*/ */
keep = GetOldestUnsummarizedLSN(NULL, NULL, false); keep = GetOldestUnsummarizedLSN(NULL, NULL);
if (keep != InvalidXLogRecPtr) if (keep != InvalidXLogRecPtr)
{ {
XLogSegNo unsummarized_segno; XLogSegNo unsummarized_segno;

View File

@ -337,7 +337,7 @@ WalSummarizerMain(char *startup_data, size_t startup_data_len)
* *
* If we discover that WAL summarization is not enabled, just exit. * If we discover that WAL summarization is not enabled, just exit.
*/ */
current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact, true); current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact);
if (XLogRecPtrIsInvalid(current_lsn)) if (XLogRecPtrIsInvalid(current_lsn))
proc_exit(0); proc_exit(0);
@ -479,23 +479,18 @@ GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn,
/* /*
* Get the oldest LSN in this server's timeline history that has not yet been * Get the oldest LSN in this server's timeline history that has not yet been
* summarized. * summarized, and update shared memory state as appropriate.
* *
* If *tli != NULL, it will be set to the TLI for the LSN that is returned. * If *tli != NULL, it will be set to the TLI for the LSN that is returned.
* *
* If *lsn_is_exact != NULL, it will be set to true if the returned LSN is * If *lsn_is_exact != NULL, it will be set to true if the returned LSN is
* necessarily the start of a WAL record and false if it's just the beginning * necessarily the start of a WAL record and false if it's just the beginning
* of a WAL segment. * of a WAL segment.
*
* If reset_pending_lsn is true, resets the pending_lsn in shared memory to
* be equal to the summarized_lsn.
*/ */
XLogRecPtr XLogRecPtr
GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact, GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact)
bool reset_pending_lsn)
{ {
TimeLineID latest_tli; TimeLineID latest_tli;
LWLockMode mode = reset_pending_lsn ? LW_EXCLUSIVE : LW_SHARED;
int n; int n;
List *tles; List *tles;
XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr; XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr;
@ -503,22 +498,21 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
bool should_make_exact = false; bool should_make_exact = false;
List *existing_summaries; List *existing_summaries;
ListCell *lc; ListCell *lc;
bool am_wal_summarizer = AmWalSummarizerProcess();
/* If not summarizing WAL, do nothing. */ /* If not summarizing WAL, do nothing. */
if (!summarize_wal) if (!summarize_wal)
return InvalidXLogRecPtr; return InvalidXLogRecPtr;
/* /*
* Unless we need to reset the pending_lsn, we initially acquire the lock * If we are not the WAL summarizer process, then we normally just want
* in shared mode and try to fetch the required information. If we acquire * to read the values from shared memory. However, as an exception, if
* in shared mode and find that the data structure hasn't been * shared memory hasn't been initialized yet, then we need to do that so
* initialized, we reacquire the lock in exclusive mode so that we can * that we can read legal values and not remove any WAL too early.
* initialize it. However, if someone else does that first before we get
* the lock, then we can just return the requested information after all.
*/ */
while (1) if (!am_wal_summarizer)
{ {
LWLockAcquire(WALSummarizerLock, mode); LWLockAcquire(WALSummarizerLock, LW_SHARED);
if (WalSummarizerCtl->initialized) if (WalSummarizerCtl->initialized)
{ {
@ -527,27 +521,22 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
*tli = WalSummarizerCtl->summarized_tli; *tli = WalSummarizerCtl->summarized_tli;
if (lsn_is_exact != NULL) if (lsn_is_exact != NULL)
*lsn_is_exact = WalSummarizerCtl->lsn_is_exact; *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
if (reset_pending_lsn)
WalSummarizerCtl->pending_lsn =
WalSummarizerCtl->summarized_lsn;
LWLockRelease(WALSummarizerLock); LWLockRelease(WALSummarizerLock);
return unsummarized_lsn; return unsummarized_lsn;
} }
if (mode == LW_EXCLUSIVE)
break;
LWLockRelease(WALSummarizerLock); LWLockRelease(WALSummarizerLock);
mode = LW_EXCLUSIVE;
} }
/* /*
* The data structure needs to be initialized, and we are the first to * Find the oldest timeline on which WAL still exists, and the earliest
* obtain the lock in exclusive mode, so it's our job to do that * segment for which it exists.
* initialization.
* *
* So, find the oldest timeline on which WAL still exists, and the * Note that we do this every time the WAL summarizer process restarts
* earliest segment for which it exists. * or recovers from an error, in case the contents of pg_wal have changed
* under us e.g. if some files were removed, either manually - which
* shouldn't really happen, but might - or by postgres itself, if
* summarize_wal was turned off and then back on again.
*/ */
(void) GetLatestLSN(&latest_tli); (void) GetLatestLSN(&latest_tli);
tles = readTimeLineHistory(latest_tli); tles = readTimeLineHistory(latest_tli);
@ -568,12 +557,6 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
} }
} }
/* It really should not be possible for us to find no WAL. */
if (unsummarized_tli == 0)
ereport(ERROR,
errcode(ERRCODE_INTERNAL_ERROR),
errmsg_internal("no WAL found on timeline %u", latest_tli));
/* /*
* Don't try to summarize anything older than the end LSN of the newest * Don't try to summarize anything older than the end LSN of the newest
* summary file that exists for this timeline. * summary file that exists for this timeline.
@ -592,12 +575,32 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact,
} }
} }
/* Update shared memory with the discovered values. */ /* It really should not be possible for us to find no WAL. */
WalSummarizerCtl->initialized = true; if (unsummarized_tli == 0)
WalSummarizerCtl->summarized_lsn = unsummarized_lsn; ereport(ERROR,
WalSummarizerCtl->summarized_tli = unsummarized_tli; errcode(ERRCODE_INTERNAL_ERROR),
WalSummarizerCtl->lsn_is_exact = should_make_exact; errmsg_internal("no WAL found on timeline %u", latest_tli));
WalSummarizerCtl->pending_lsn = unsummarized_lsn;
/*
* If we're the WAL summarizer, we always want to store the values we
* just computed into shared memory, because those are the values we're
* going to use to drive our operation, and so they are the authoritative
* values. Otherwise, we only store values into shared memory if shared
* memory is uninitialized. Our values are not canonical in such a case,
* but it's better to have something than nothing, to guide WAL
* retention.
*/
LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
if (am_wal_summarizer|| !WalSummarizerCtl->initialized)
{
WalSummarizerCtl->initialized = true;
WalSummarizerCtl->summarized_lsn = unsummarized_lsn;
WalSummarizerCtl->summarized_tli = unsummarized_tli;
WalSummarizerCtl->lsn_is_exact = should_make_exact;
WalSummarizerCtl->pending_lsn = unsummarized_lsn;
}
else
unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
/* Also return the values to the caller as required. */ /* Also return the values to the caller as required. */
if (tli != NULL) if (tli != NULL)

View File

@ -28,8 +28,7 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli,
XLogRecPtr *pending_lsn, XLogRecPtr *pending_lsn,
int *summarizer_pid); int *summarizer_pid);
extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli, extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli,
bool *lsn_is_exact, bool *lsn_is_exact);
bool reset_pending_lsn);
extern void SetWalSummarizerLatch(void); extern void SetWalSummarizerLatch(void);
extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout, extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout,
XLogRecPtr *pending_lsn); XLogRecPtr *pending_lsn);