Clean up TAP tests' usage of wait_for_catchup().

By default, wait_for_catchup() waits for the replication connection
to reach the primary's write LSN.  That's fine, but in an apparent
attempt to save one query round-trip, it was coded so that we
executed pg_current_wal_lsn() again during each probe query.
Thus, we presented the standby with a moving target to be reached.
(While the test script itself couldn't be causing the write LSN
to advance while it's blocked in wait_for_catchup(), it's plenty
plausible that background activity such as autovacuum is emitting
more WAL.)  That could make the test take longer than necessary,
and potentially it could mask bugs by allowing the standby to process
more WAL than a strict interpretation of the test scenario allows.
So, change wait_for_catchup() to do it "by the book", explicitly
collecting the write LSN to wait for at the outset.
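
To make the change concrete, here is roughly how the probe changes inside
wait_for_catchup() (the application_name 'standby_1' and the default
'replay' mode are illustrative, not taken from any particular test):

    # Before: the target is recomputed on the primary at every poll, so
    # concurrent WAL generation keeps moving the goalposts.
    my $query =
      qq[SELECT pg_current_wal_lsn() <= replay_lsn AND state = 'streaming'
         FROM pg_catalog.pg_stat_replication
         WHERE application_name = 'standby_1';];

    # After: collect the primary's write LSN once ($self->lsn('write')
    # runs pg_current_wal_lsn() a single time), then poll against that
    # fixed value.
    my $target_lsn = $self->lsn('write');
    $query =
      qq[SELECT '$target_lsn' <= replay_lsn AND state = 'streaming'
         FROM pg_catalog.pg_stat_replication
         WHERE application_name = 'standby_1';];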

Also, various call sites were instructing wait_for_catchup() to
wait for the standby to reach the primary's insert LSN rather than
its write LSN.  This also seems like a bad idea.  While in most
test scenarios those are the same, if they are different then the
inserted-but-not-yet-written WAL is not presently available to the
standby.  The test isn't doing anything to make it become so, so
again we have the potential for unwanted test delay, perhaps even
a test timeout.  (Again, background activity would be needed to
make this more than a hypothetical problem.)  Hence, change the
callers where necessary so that the wait target is always the
primary's write LSN.
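
For illustration (the $primary handle here is hypothetical), the two LSNs
in question are exposed by the Cluster.pm lsn() helper as follows:

    # 'insert' reports pg_current_wal_insert_lsn(), which can run ahead of
    # 'write' (pg_current_wal_lsn()) when WAL has been reserved but not yet
    # written out; such WAL is not yet available to the standby.
    my $insert_lsn = $primary->lsn('insert');
    my $write_lsn  = $primary->lsn('write');
    # Waiting for $insert_lsn may therefore stall until unrelated activity
    # pushes the remaining WAL out; $write_lsn is already reachable.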

While at it, simplify callers by making use of wait_for_catchup's
default arguments wherever possible (the preceding change makes
this possible in more places than it was before).  And rewrite
wait_for_catchup's documentation a bit.
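
In practice, the simplified call sites look like this (names follow the
tests touched below):

    # Before: explicit mode plus an explicitly collected target LSN
    $node_primary->wait_for_catchup($node_standby, 'replay',
        $node_primary->lsn('insert'));

    # After: 'replay' and the primary's current write LSN are the defaults
    $node_primary->wait_for_catchup($node_standby);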

Patch by me; thanks to Julien Rouhaud for review.

Discussion: https://postgr.es/m/2368336.1641843098@sss.pgh.pa.us
Tom Lane 2022-01-16 13:29:02 -05:00
parent 269b532aef
commit ed48e3582e
13 changed files with 59 additions and 84 deletions


@@ -290,7 +290,7 @@ $standby->psql(
 "CREATE_REPLICATION_SLOT $archive_slot PHYSICAL (RESERVE_WAL)",
 replication => 1);
 # Wait for standby catchup
-$primary->wait_for_catchup($standby, 'replay', $primary->lsn('write'));
+$primary->wait_for_catchup($standby);
 # Get a walfilename from before the promotion to make sure it is archived
 # after promotion
 my $standby_slot = $standby->slot($archive_slot);


@@ -74,7 +74,7 @@ $node_a->safe_psql('postgres',
 "INSERT INTO tbl1 values ('in A, before promotion')");
 $node_a->safe_psql('postgres', 'CHECKPOINT');
-my $lsn = $node_a->lsn('insert');
+my $lsn = $node_a->lsn('write');
 $node_a->wait_for_catchup('node_b', 'write', $lsn);
 $node_b->wait_for_catchup('node_c', 'write', $lsn);
@@ -93,8 +93,7 @@ $node_a->safe_psql('postgres',
 "INSERT INTO tbl1 VALUES ('in A, after C was promoted')");
 # make sure it's replicated to B before we continue
-$lsn = $node_a->lsn('insert');
-$node_a->wait_for_catchup('node_b', 'replay', $lsn);
+$node_a->wait_for_catchup('node_b');
 # Also insert a new row in the standby, which won't be present in the
 # old primary.
@@ -161,8 +160,7 @@ in A, after C was promoted
 $node_a->safe_psql('postgres',
 "INSERT INTO tbl1 values ('in A, after rewind')");
-$lsn = $node_a->lsn('insert');
-$node_b->wait_for_catchup('node_c', 'replay', $lsn);
+$node_b->wait_for_catchup('node_c', 'replay', $node_a->lsn('write'));
 check_query(
 'SELECT * FROM tbl1',


@@ -69,8 +69,7 @@ $node_3->init_from_backup($node_1, $backup_name, has_streaming => 1);
 $node_3->start;
 # Wait until node 3 has connected and caught up
-my $lsn = $node_1->lsn('insert');
-$node_1->wait_for_catchup('node_3', 'replay', $lsn);
+$node_1->wait_for_catchup('node_3');
 #
 # Swap the roles of node_1 and node_3, so that node_1 follows node_3.
@@ -106,8 +105,7 @@ $node_2->restart();
 #
 # make sure node_1 is full caught up with node_3 first
-$lsn = $node_3->lsn('insert');
-$node_3->wait_for_catchup('node_1', 'replay', $lsn);
+$node_3->wait_for_catchup('node_1');
 $node_1->promote;
 # Force a checkpoint after promotion, like earlier.


@@ -2496,22 +2496,27 @@ sub lsn
 =item $node->wait_for_catchup(standby_name, mode, target_lsn)
-Wait for the node with application_name standby_name (usually from node->name,
-also works for logical subscriptions)
-until its replication location in pg_stat_replication equals or passes the
-upstream's WAL insert point at the time this function is called. By default
-the replay_lsn is waited for, but 'mode' may be specified to wait for any of
-sent|write|flush|replay. The connection catching up must be in a streaming
-state.
+Wait for the replication connection with application_name standby_name until
+its 'mode' replication column in pg_stat_replication equals or passes the
+specified or default target_lsn. By default the replay_lsn is waited for,
+but 'mode' may be specified to wait for any of sent|write|flush|replay.
+The replication connection must be in a streaming state.
+When doing physical replication, the standby is usually identified by
+passing its PostgreSQL::Test::Cluster instance. When doing logical
+replication, standby_name identifies a subscription.
+The default value of target_lsn is $node->lsn('write'), which ensures
+that the standby has caught up to what has been committed on the primary.
+If you pass an explicit value of target_lsn, it should almost always be
+the primary's write LSN; so this parameter is seldom needed except when
+querying some intermediate replication node rather than the primary.
 If there is no active replication connection from this peer, waits until
 poll_query_until timeout.
 Requires that the 'postgres' db exists and is accessible.
-target_lsn may be any arbitrary lsn, but is typically $primary_node->lsn('insert').
-If omitted, pg_current_wal_lsn() is used.
 This is not a test. It die()s on failure.
 =cut
@@ -2531,23 +2536,18 @@ sub wait_for_catchup
 {
 $standby_name = $standby_name->name;
 }
-my $lsn_expr;
-if (defined($target_lsn))
+if (!defined($target_lsn))
 {
-$lsn_expr = "'$target_lsn'";
-}
-else
-{
-$lsn_expr = 'pg_current_wal_lsn()';
+$target_lsn = $self->lsn('write');
 }
 print "Waiting for replication conn "
 . $standby_name . "'s "
 . $mode
 . "_lsn to pass "
-. $lsn_expr . " on "
+. $target_lsn . " on "
 . $self->name . "\n";
 my $query =
-qq[SELECT $lsn_expr <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE application_name = '$standby_name';];
+qq[SELECT '$target_lsn' <= ${mode}_lsn AND state = 'streaming' FROM pg_catalog.pg_stat_replication WHERE application_name = '$standby_name';];
 $self->poll_query_until('postgres', $query)
 or croak "timed out waiting for catchup";
 print "done\n";


@@ -47,10 +47,9 @@ $node_primary->safe_psql('postgres',
 "CREATE TABLE tab_int AS SELECT generate_series(1,1002) AS a");
 # Wait for standbys to catch up
-$node_primary->wait_for_catchup($node_standby_1, 'replay',
-$node_primary->lsn('insert'));
-$node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-$node_standby_1->lsn('replay'));
+my $primary_lsn = $node_primary->lsn('write');
+$node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn);
+$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn);
 my $result =
 $node_standby_1->safe_psql('postgres', "SELECT count(*) FROM tab_int");
@@ -67,10 +66,9 @@ $node_primary->safe_psql('postgres',
 "CREATE SEQUENCE seq1; SELECT nextval('seq1')");
 # Wait for standbys to catch up
-$node_primary->wait_for_catchup($node_standby_1, 'replay',
-$node_primary->lsn('insert'));
-$node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-$node_standby_1->lsn('replay'));
+$primary_lsn = $node_primary->lsn('write');
+$node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn);
+$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn);
 $result = $node_standby_1->safe_psql('postgres', "SELECT * FROM seq1");
 print "standby 1: $result\n";
@@ -374,10 +372,10 @@ sub replay_check
 my $newval = $node_primary->safe_psql('postgres',
 'INSERT INTO replayed(val) SELECT coalesce(max(val),0) + 1 AS newval FROM replayed RETURNING val'
 );
-$node_primary->wait_for_catchup($node_standby_1, 'replay',
-$node_primary->lsn('insert'));
-$node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-$node_standby_1->lsn('replay'));
+my $primary_lsn = $node_primary->lsn('write');
+$node_primary->wait_for_catchup($node_standby_1, 'replay', $primary_lsn);
+$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $primary_lsn);
 $node_standby_1->safe_psql('postgres',
 qq[SELECT 1 FROM replayed WHERE val = $newval])
 or die "standby_1 didn't replay primary value $newval";
@@ -481,8 +479,7 @@ $node_standby_1->stop;
 my $newval = $node_primary->safe_psql('postgres',
 'INSERT INTO replayed(val) SELECT coalesce(max(val),0) + 1 AS newval FROM replayed RETURNING val'
 );
-$node_primary->wait_for_catchup($node_standby_2, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby_2);
 my $is_replayed = $node_standby_2->safe_psql('postgres',
 qq[SELECT 1 FROM replayed WHERE val = $newval]);
 is($is_replayed, qq(1), "standby_2 didn't replay primary value $newval");


@@ -38,8 +38,7 @@ $node_primary->safe_psql('postgres',
 "CREATE TABLE tab_int AS SELECT generate_series(1,1000) AS a");
 # Wait until standby has replayed enough data on standby 1
-$node_primary->wait_for_catchup($node_standby_1, 'replay',
-$node_primary->lsn('write'));
+$node_primary->wait_for_catchup($node_standby_1);
 # Stop and remove primary
 $node_primary->teardown_node;
@@ -64,8 +63,7 @@ $node_standby_2->restart;
 # to ensure that the timeline switch has been done.
 $node_standby_1->safe_psql('postgres',
 "INSERT INTO tab_int VALUES (generate_series(1001,2000))");
-$node_standby_1->wait_for_catchup($node_standby_2, 'replay',
-$node_standby_1->lsn('write'));
+$node_standby_1->wait_for_catchup($node_standby_2);
 my $result =
 $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int");
@@ -103,8 +101,7 @@ $node_primary_2->promote;
 $node_standby_3->start;
 $node_primary_2->safe_psql('postgres',
 "CREATE TABLE tab_int AS SELECT 1 AS a");
-$node_primary_2->wait_for_catchup($node_standby_3, 'replay',
-$node_primary_2->lsn('write'));
+$node_primary_2->wait_for_catchup($node_standby_3);
 my $result_2 =
 $node_standby_3->safe_psql('postgres', "SELECT count(*) FROM tab_int");


@@ -88,8 +88,7 @@ $node_replica->start;
 # db and associated slot.
 is($node_primary->psql('postgres', 'DROP DATABASE dropme'),
 0, 'dropped DB with logical slot OK on primary');
-$node_primary->wait_for_catchup($node_replica, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_replica);
 is( $node_replica->safe_psql(
 'postgres', q[SELECT 1 FROM pg_database WHERE datname = 'dropme']),
 '',


@@ -103,8 +103,7 @@ $node_primary->psql(
 BEGIN;
 SELECT hs_subxids(127);
 COMMIT;");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->psql(
 'postgres',
 "SELECT coalesce(sum(id),-1) FROM t_012_tbl",
@@ -150,8 +149,7 @@ $node_primary->psql(
 BEGIN;
 SELECT hs_subxids(127);
 PREPARE TRANSACTION 'xact_012_1';");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->psql(
 'postgres',
 "SELECT coalesce(sum(id),-1) FROM t_012_tbl",
@@ -187,8 +185,7 @@ $node_primary->psql(
 BEGIN;
 SELECT hs_subxids(201);
 PREPARE TRANSACTION 'xact_012_1';");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->psql(
 'postgres',
 "SELECT coalesce(sum(id),-1) FROM t_012_tbl",


@@ -44,7 +44,7 @@ $alpha->safe_psql('postgres', 'checkpoint');
 # problematic WAL records.
 $alpha->safe_psql('postgres', 'vacuum verbose test1');
 # Wait for last record to have been replayed on the standby.
-$alpha->wait_for_catchup($bravo, 'replay', $alpha->lsn('insert'));
+$alpha->wait_for_catchup($bravo);
 # Now force a checkpoint on the standby. This seems unnecessary but for "some"
 # reason, the previous checkpoint on the primary does not reflect on the standby
@@ -60,7 +60,7 @@ $alpha->safe_psql('postgres',
 $alpha->safe_psql('postgres', 'truncate test2');
 # Wait again for all records to be replayed.
-$alpha->wait_for_catchup($bravo, 'replay', $alpha->lsn('insert'));
+$alpha->wait_for_catchup($bravo);
 # Do the promotion, which reinitializes minRecoveryPoint in the control
 # file so as WAL is replayed up to the end.


@@ -78,7 +78,7 @@ $primary->safe_psql('postgres', 'CHECKPOINT;');
 $primary->safe_psql('postgres', 'UPDATE test1 SET a = a + 1;');
 # Wait for last record to have been replayed on the standby.
-$primary->wait_for_catchup($standby, 'replay', $primary->lsn('insert'));
+$primary->wait_for_catchup($standby);
 # Fill in the standby's shared buffers with the data filled in
 # previously.
@@ -99,7 +99,7 @@ my $relfilenode = $primary->safe_psql('postgres',
 "SELECT pg_relation_filepath('test1'::regclass);");
 # Wait for last record to have been replayed on the standby.
-$primary->wait_for_catchup($standby, 'replay', $primary->lsn('insert'));
+$primary->wait_for_catchup($standby);
 # Issue a restart point on the standby now, which makes the checkpointer
 # update minRecoveryPoint.


@@ -49,8 +49,7 @@ $node_standby->append_conf('postgresql.conf', "primary_slot_name = 'rep1'");
 $node_standby->start;
 # Wait until standby has replayed enough data
-my $start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);
 # Stop standby
 $node_standby->stop;
@@ -84,8 +83,7 @@ is($result, "reserved|t", 'check that slot is working');
 # The standby can reconnect to primary
 $node_standby->start;
-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->stop;
@@ -115,8 +113,7 @@ is($result, "reserved",
 # The standby can reconnect to primary
 $node_standby->start;
-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->stop;
 # wal_keep_size overrides max_slot_wal_keep_size
@@ -135,8 +132,7 @@ $result = $node_primary->safe_psql('postgres',
 # The standby can reconnect to primary
 $node_standby->start;
-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->stop;
 # Advance WAL again without checkpoint, reducing remain by 6 MB.
@@ -163,8 +159,7 @@ is($result, "unreserved|t",
 # The standby still can connect to primary before a checkpoint
 $node_standby->start;
-$start_lsn = $node_primary->lsn('write');
-$node_primary->wait_for_catchup($node_standby, 'replay', $start_lsn);
+$node_primary->wait_for_catchup($node_standby);
 $node_standby->stop;
@@ -334,7 +329,7 @@ $node_standby3->init_from_backup($node_primary3, $backup_name,
 has_streaming => 1);
 $node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
 $node_standby3->start;
-$node_primary3->wait_for_catchup($node_standby3->name, 'replay');
+$node_primary3->wait_for_catchup($node_standby3);
 my $senderpid = $node_primary3->safe_psql('postgres',
 "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'");
 like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid");


@@ -76,8 +76,7 @@ ok( send_query_and_wait(
 #
 $node_primary->psql('postgres',
 "INSERT INTO test_visibility VALUES ('first insert')");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 ok( send_query_and_wait(
 \%psql_standby,
@@ -98,8 +97,7 @@ UPDATE test_visibility SET data = 'first update' RETURNING data;
 'UPDATE');
 $node_primary->psql('postgres', "SELECT txid_current();"); # ensure WAL flush
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 ok( send_query_and_wait(
 \%psql_standby,
@@ -112,8 +110,7 @@ ok( send_query_and_wait(
 #
 ok(send_query_and_wait(\%psql_primary, q[COMMIT;], qr/^COMMIT$/m), 'COMMIT');
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 ok( send_query_and_wait(
 \%psql_standby,
@@ -142,8 +139,7 @@ PREPARE TRANSACTION 'will_abort';
 qr/^PREPARE TRANSACTION$/m),
 'prepared will_abort');
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 ok( send_query_and_wait(
 \%psql_standby,
@@ -154,8 +150,7 @@ ok( send_query_and_wait(
 # For some variation, finish prepared xacts via separate connections
 $node_primary->safe_psql('postgres', "COMMIT PREPARED 'will_commit';");
 $node_primary->safe_psql('postgres', "ROLLBACK PREPARED 'will_abort';");
-$node_primary->wait_for_catchup($node_standby, 'replay',
-$node_primary->lsn('insert'));
+$node_primary->wait_for_catchup($node_standby);
 ok( send_query_and_wait(
 \%psql_standby,


@@ -101,8 +101,7 @@ $node_cascade->start;
 $node_standby->safe_psql('postgres', "CREATE TABLE tab_int AS SELECT 1 AS a");
 # Wait for the replication to catch up
-$node_standby->wait_for_catchup($node_cascade, 'replay',
-$node_standby->lsn('insert'));
+$node_standby->wait_for_catchup($node_cascade);
 # Check that cascading standby has the new content
 my $result =