From 065583cf460f980a182498941ac52810f709a897 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 25 Jun 2024 15:42:36 -0400 Subject: [PATCH] Prevent summarizer hang when summarize_wal turned off and back on. Before this commit, when the WAL summarizer started up or recovered from an error, it would resume summarization from wherever it left off. That was OK normally, but wrong if summarize_wal had been turned off temporarily, allowing some WAL to be removed, and then turned back on again. In such cases, the WAL summarizer would simply hang forever. This commit changes the reinitialization sequence for the WAL summarizer to rederive the starting position in the way we were already doing at initial startup, fixing the problem. Per report from Israel Barth Rubio. Reviewed by Tom Lane. Discussion: http://postgr.es/m/CA+TgmoYN6x=YS+FoFOS6=nr6=qkXZFWhdiL7k0oatGwug2hcuA@mail.gmail.com --- src/backend/access/transam/xlog.c | 2 +- src/backend/postmaster/walsummarizer.c | 83 +++++++++++++------------- src/include/postmaster/walsummarizer.h | 3 +- 3 files changed, 45 insertions(+), 43 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a69337f2d4..d36272ab4f 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7916,7 +7916,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) * If WAL summarization is in use, don't remove WAL that has yet to be * summarized. */ - keep = GetOldestUnsummarizedLSN(NULL, NULL, false); + keep = GetOldestUnsummarizedLSN(NULL, NULL); if (keep != InvalidXLogRecPtr) { XLogSegNo unsummarized_segno; diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c index 72f6c04478..7f987af40d 100644 --- a/src/backend/postmaster/walsummarizer.c +++ b/src/backend/postmaster/walsummarizer.c @@ -337,7 +337,7 @@ WalSummarizerMain(char *startup_data, size_t startup_data_len) * * If we discover that WAL summarization is not enabled, just exit. 
*/ - current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact, true); + current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact); if (XLogRecPtrIsInvalid(current_lsn)) proc_exit(0); @@ -479,23 +479,18 @@ GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn, /* * Get the oldest LSN in this server's timeline history that has not yet been - * summarized. + * summarized, and update shared memory state as appropriate. * * If *tli != NULL, it will be set to the TLI for the LSN that is returned. * * If *lsn_is_exact != NULL, it will be set to true if the returned LSN is * necessarily the start of a WAL record and false if it's just the beginning * of a WAL segment. - * - * If reset_pending_lsn is true, resets the pending_lsn in shared memory to - * be equal to the summarized_lsn. */ XLogRecPtr -GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact, - bool reset_pending_lsn) +GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact) { TimeLineID latest_tli; - LWLockMode mode = reset_pending_lsn ? LW_EXCLUSIVE : LW_SHARED; int n; List *tles; XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr; @@ -503,22 +498,21 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact, bool should_make_exact = false; List *existing_summaries; ListCell *lc; + bool am_wal_summarizer = AmWalSummarizerProcess(); /* If not summarizing WAL, do nothing. */ if (!summarize_wal) return InvalidXLogRecPtr; /* - * Unless we need to reset the pending_lsn, we initially acquire the lock - * in shared mode and try to fetch the required information. If we acquire - * in shared mode and find that the data structure hasn't been - * initialized, we reacquire the lock in exclusive mode so that we can - * initialize it. However, if someone else does that first before we get - * the lock, then we can just return the requested information after all. + * If we are not the WAL summarizer process, then we normally just want + * to read the values from shared memory. 
However, as an exception, if + * shared memory hasn't been initialized yet, then we need to do that so + * that we can read legal values and not remove any WAL too early. */ - while (1) + if (!am_wal_summarizer) { - LWLockAcquire(WALSummarizerLock, mode); + LWLockAcquire(WALSummarizerLock, LW_SHARED); if (WalSummarizerCtl->initialized) { @@ -527,27 +521,22 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact, *tli = WalSummarizerCtl->summarized_tli; if (lsn_is_exact != NULL) *lsn_is_exact = WalSummarizerCtl->lsn_is_exact; - if (reset_pending_lsn) - WalSummarizerCtl->pending_lsn = - WalSummarizerCtl->summarized_lsn; LWLockRelease(WALSummarizerLock); return unsummarized_lsn; } - if (mode == LW_EXCLUSIVE) - break; - LWLockRelease(WALSummarizerLock); - mode = LW_EXCLUSIVE; } /* - * The data structure needs to be initialized, and we are the first to - * obtain the lock in exclusive mode, so it's our job to do that - * initialization. + * Find the oldest timeline on which WAL still exists, and the earliest + * segment for which it exists. * - * So, find the oldest timeline on which WAL still exists, and the - * earliest segment for which it exists. + * Note that we do this every time the WAL summarizer process restarts + * or recovers from an error, in case the contents of pg_wal have changed + * under us e.g. if some files were removed, either manually - which + * shouldn't really happen, but might - or by postgres itself, if + * summarize_wal was turned off and then back on again. */ (void) GetLatestLSN(&latest_tli); tles = readTimeLineHistory(latest_tli); @@ -568,12 +557,6 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact, } } - /* It really should not be possible for us to find no WAL. 
*/ - if (unsummarized_tli == 0) - ereport(ERROR, - errcode(ERRCODE_INTERNAL_ERROR), - errmsg_internal("no WAL found on timeline %u", latest_tli)); - /* * Don't try to summarize anything older than the end LSN of the newest * summary file that exists for this timeline. @@ -592,12 +575,32 @@ GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact, } } - /* Update shared memory with the discovered values. */ - WalSummarizerCtl->initialized = true; - WalSummarizerCtl->summarized_lsn = unsummarized_lsn; - WalSummarizerCtl->summarized_tli = unsummarized_tli; - WalSummarizerCtl->lsn_is_exact = should_make_exact; - WalSummarizerCtl->pending_lsn = unsummarized_lsn; + /* It really should not be possible for us to find no WAL. */ + if (unsummarized_tli == 0) + ereport(ERROR, + errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("no WAL found on timeline %u", latest_tli)); + + /* + * If we're the WAL summarizer, we always want to store the values we + * just computed into shared memory, because those are the values we're + * going to use to drive our operation, and so they are the authoritative + * values. Otherwise, we only store values into shared memory if shared + * memory is uninitialized. Our values are not canonical in such a case, + * but it's better to have something than nothing, to guide WAL + * retention. + */ + LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); + if (am_wal_summarizer|| !WalSummarizerCtl->initialized) + { + WalSummarizerCtl->initialized = true; + WalSummarizerCtl->summarized_lsn = unsummarized_lsn; + WalSummarizerCtl->summarized_tli = unsummarized_tli; + WalSummarizerCtl->lsn_is_exact = should_make_exact; + WalSummarizerCtl->pending_lsn = unsummarized_lsn; + } + else + unsummarized_lsn = WalSummarizerCtl->summarized_lsn; /* Also return the to the caller as required. 
*/ if (tli != NULL) diff --git a/src/include/postmaster/walsummarizer.h b/src/include/postmaster/walsummarizer.h index ad346d0c11..112bc1e6cb 100644 --- a/src/include/postmaster/walsummarizer.h +++ b/src/include/postmaster/walsummarizer.h @@ -28,8 +28,7 @@ extern void GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *pending_lsn, int *summarizer_pid); extern XLogRecPtr GetOldestUnsummarizedLSN(TimeLineID *tli, - bool *lsn_is_exact, - bool reset_pending_lsn); + bool *lsn_is_exact); extern void SetWalSummarizerLatch(void); extern XLogRecPtr WaitForWalSummarization(XLogRecPtr lsn, long timeout, XLogRecPtr *pending_lsn);