From 551aa6b7b9f3b78e7b6cbab1fea70947fa7f4966 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 29 Sep 2022 13:27:48 -0400 Subject: [PATCH] Improve wording of log messages triggered by max_slot_wal_keep_size. The one about "terminating process to release replication slot" told you nothing about why that was happening. The one about "invalidating slot because its restart_lsn exceeds max_slot_wal_keep_size" told you what was happening, but violated our message style guideline about keeping the primary message short. Add DETAIL/HINT lines to carry the appropriate detail and make the two cases more uniform. While here, fix bogus test logic in 019_replslot_limit.pl: if it timed out without seeing the expected log message, no test failure would be reported. This is flat broken since commit 549ec201d removed the test counts; even before that it was horribly bad style, since you'd only get told that not all tests had been run. Kyotaro Horiguchi, reviewed by Bertrand Drouvot; test fixes by me Discussion: https://postgr.es/m/20211214.130456.2233153190058148084.horikyota.ntt@gmail.com --- src/backend/replication/slot.c | 17 ++++++++++++----- src/test/recovery/t/019_replslot_limit.pl | 15 +++++++++------ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 0bd0031188..e9328961dd 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -1293,8 +1293,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN, if (last_signaled_pid != active_pid) { ereport(LOG, - (errmsg("terminating process %d to release replication slot \"%s\"", - active_pid, NameStr(slotname)))); + errmsg("terminating process %d to release replication slot \"%s\"", + active_pid, NameStr(slotname)), + errdetail("The slot's restart_lsn %X/%X exceeds the limit by %llu bytes.", + LSN_FORMAT_ARGS(restart_lsn), + (unsigned long long) (oldestLSN - restart_lsn)), + errhint("You might need to increase max_slot_wal_keep_size.")); (void) kill(active_pid, SIGTERM); last_signaled_pid = active_pid; @@ -1331,9 +1335,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN, ReplicationSlotRelease(); ereport(LOG, - (errmsg("invalidating slot \"%s\" because its restart_lsn %X/%X exceeds max_slot_wal_keep_size", - NameStr(slotname), - LSN_FORMAT_ARGS(restart_lsn)))); + errmsg("invalidating obsolete replication slot \"%s\"", + NameStr(slotname)), + errdetail("The slot's restart_lsn %X/%X exceeds the limit by %llu bytes.", + LSN_FORMAT_ARGS(restart_lsn), + (unsigned long long) (oldestLSN - restart_lsn)), + errhint("You might need to increase max_slot_wal_keep_size.")); /* done with this slot for now */ break; diff --git a/src/test/recovery/t/019_replslot_limit.pl b/src/test/recovery/t/019_replslot_limit.pl index ce8d36b4c6..b8341fbb66 100644 --- a/src/test/recovery/t/019_replslot_limit.pl +++ b/src/test/recovery/t/019_replslot_limit.pl @@ -185,8 +185,7 @@ my $invalidated = 0; for (my $i = 0; $i < 10000; $i++) { if (find_in_log( - $node_primary, - "invalidating slot \"rep1\" because its restart_lsn [0-9A-F/]+ exceeds max_slot_wal_keep_size", + $node_primary, 'invalidating obsolete replication slot "rep1"', $logstart)) { $invalidated = 1; @@ -379,6 +378,7 @@ $logstart = get_log_size($node_primary3); kill 'STOP', $senderpid, $receiverpid; advance_wal($node_primary3, 2); +my $msg_logged = 0; my $max_attempts = $PostgreSQL::Test::Utils::timeout_default; while ($max_attempts-- >= 0) { @@ -387,11 +387,12 @@ while ($max_attempts-- >= 0) "terminating process $senderpid to release replication slot \"rep3\"", $logstart)) { - ok(1, "walsender termination logged"); + $msg_logged = 1; last; } sleep 1; } +ok($msg_logged, "walsender termination logged"); # Now let the walsender continue; slot should be killed now. # (Must not let walreceiver run yet; otherwise the standby could start another @@ -402,18 +403,20 @@ $node_primary3->poll_query_until('postgres', "lost") or die "timed out waiting for slot to be lost"; +$msg_logged = 0; $max_attempts = $PostgreSQL::Test::Utils::timeout_default; while ($max_attempts-- >= 0) { if (find_in_log( - $node_primary3, - 'invalidating slot "rep3" because its restart_lsn', $logstart)) + $node_primary3, 'invalidating obsolete replication slot "rep3"', + $logstart)) { - ok(1, "slot invalidation logged"); + $msg_logged = 1; last; } sleep 1; } +ok($msg_logged, "slot invalidation logged"); # Now let the walreceiver continue, so that the node can be stopped cleanly kill 'CONT', $receiverpid;