From 54a16df0100da445be7c79eb81dfb96fd0685e6d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 22 Apr 2015 14:28:37 +0300
Subject: [PATCH] Make the pg_rewind regression tests more robust on slow
 systems.

There were a couple of hard-coded sleeps in the tests: to wait for standby
to catch up with master, and to wait for promotion with "pg_ctl promote"
to complete. Instead of a fixed, hard-coded sleep, poll the server with a
query once a second. This isn't ideal either, and I wish we had a better
solution for real-world applications too, but this should fix the
immediate problem.

Patch by Michael Paquier, with some editing by me.
---
 src/bin/pg_rewind/RewindTest.pm | 46 +++++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 5 deletions(-)

diff --git a/src/bin/pg_rewind/RewindTest.pm b/src/bin/pg_rewind/RewindTest.pm
index 84e0512143..a5f7d08bf7 100644
--- a/src/bin/pg_rewind/RewindTest.pm
+++ b/src/bin/pg_rewind/RewindTest.pm
@@ -125,6 +125,37 @@ sub check_query
 	}
 }
 
+# Run a query once a second, until it returns 't' (i.e. SQL boolean true).
+sub poll_query_until
+{
+	my ($query, $connstr) = @_;
+
+	my $max_attempts = 30;
+	my $attempts = 0;
+	my ($stdout, $stderr);
+
+	while ($attempts < $max_attempts)
+	{
+		my $cmd = ['psql', '-At', '-c', "$query", '-d', "$connstr" ];
+		my $result = run $cmd, '>', \$stdout, '2>', \$stderr;
+
+		chomp($stdout);
+		if ($stdout eq "t")
+		{
+			return 1;
+		}
+
+		# Wait a second before retrying.
+		sleep 1;
+		$attempts++;
+	}
+
+	# The query result didn't change in 30 seconds. Give up. Print the stderr
+	# from the last attempt, hopefully that's useful for debugging.
+	diag $stderr;
+	return 0;
+}
+
 sub append_to_file
 {
 	my($filename, $str) = @_;
@@ -185,7 +216,7 @@ sub create_standby
 	# Base backup is taken with xlog files included
 	system_or_bail("pg_basebackup -D $test_standby_datadir -p $port_master -x >>$log_path 2>&1");
 	append_to_file("$test_standby_datadir/recovery.conf", qq(
-primary_conninfo='$connstr_master'
+primary_conninfo='$connstr_master application_name=rewind_standby'
 standby_mode=on
 recovery_target_timeline='latest'
 ));
@@ -193,8 +224,11 @@ recovery_target_timeline='latest'
 	# Start standby
 	system_or_bail("pg_ctl -w -D $test_standby_datadir -o \"-k $tempdir_short --listen-addresses='' -p $port_standby\" start >>$log_path 2>&1");
 
-	# sleep a bit to make sure the standby has caught up.
-	sleep 1;
+	# Wait until the standby has caught up with the primary, by polling
+	# pg_stat_replication.
+	my $caughtup_query = "SELECT pg_current_xlog_location() = replay_location FROM pg_stat_replication WHERE application_name = 'rewind_standby';";
+	poll_query_until($caughtup_query, $connstr_master)
+		or die "Timed out while waiting for standby to catch up";
 }
 
 sub promote_standby
@@ -203,9 +237,11 @@ sub promote_standby
 	# up standby
 
 	# Now promote slave and insert some new data on master, this will put
-	# the master out-of-sync with the standby.
+	# the master out-of-sync with the standby. Wait until the standby is
+	# out of recovery mode, and is ready to accept read-write connections.
 	system_or_bail("pg_ctl -w -D $test_standby_datadir promote >>$log_path 2>&1");
-	sleep 2;
+	poll_query_until("SELECT NOT pg_is_in_recovery()", $connstr_standby)
+		or die "Timed out while waiting for promotion of standby";
 }
 
 sub run_pg_rewind