Tighten TAP tests' tracking of postmaster state some more.
Commits 6c4a8903b et al. had a couple of deficiencies: * The logic I added to Cluster::start to see if a PID file is present could be fooled by a stale PID file left over from a previous postmaster. To fix, if we're not sure whether we expect to find a running postmaster or not, validate the PID using "kill 0". * 017_shm.pl has a loop in which it just issues repeated Cluster::start calls; this will fail if some invocation fails but leaves self->_pid set. Per buildfarm results, the above fix is not enough to make this safe: we might have "validated" a PID for a postmaster that exits immediately after we look. Hence, match each failed start call with a stop call that will get us back to the self->_pid == undef state. Add a fail_ok option to Cluster::stop to make this work. Discussion: https://postgr.es/m/CA+hUKGKV6fOHvfiPt8=dOKzvswjAyLoFoJF1iQXMNpi7+hD1JQ@mail.gmail.com
This commit is contained in:
parent
3500ccc39b
commit
4fdbf9af51
@ -896,23 +896,40 @@ Note: if the node is already known stopped, this does nothing.
|
||||
However, if we think it's running and it's not, it's important for
|
||||
this to fail. Otherwise, tests might fail to detect server crashes.
|
||||
|
||||
With optional extra param fail_ok => 1, returns 0 for failure
|
||||
instead of bailing out.
|
||||
|
||||
=cut
|
||||
|
||||
sub stop
|
||||
{
|
||||
my ($self, $mode) = @_;
|
||||
my $port = $self->port;
|
||||
my ($self, $mode, %params) = @_;
|
||||
my $pgdata = $self->data_dir;
|
||||
my $name = $self->name;
|
||||
my $ret;
|
||||
|
||||
local %ENV = $self->_get_env();
|
||||
|
||||
$mode = 'fast' unless defined $mode;
|
||||
return unless defined $self->{_pid};
|
||||
return 1 unless defined $self->{_pid};
|
||||
|
||||
print "### Stopping node \"$name\" using mode $mode\n";
|
||||
PostgreSQL::Test::Utils::system_or_bail('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop');
|
||||
$ret = PostgreSQL::Test::Utils::system_log('pg_ctl', '-D', $pgdata,
|
||||
'-m', $mode, 'stop');
|
||||
|
||||
if ($ret != 0)
|
||||
{
|
||||
print "# pg_ctl stop failed: $ret\n";
|
||||
|
||||
# Check to see if we still have a postmaster or not.
|
||||
$self->_update_pid(-1);
|
||||
|
||||
BAIL_OUT("pg_ctl stop failed") unless $params{fail_ok};
|
||||
return 0;
|
||||
}
|
||||
|
||||
$self->_update_pid(0);
|
||||
return;
|
||||
return 1;
|
||||
}
|
||||
|
||||
=pod
|
||||
@ -1142,9 +1159,20 @@ sub _update_pid
|
||||
if (open my $pidfile, '<', $self->data_dir . "/postmaster.pid")
|
||||
{
|
||||
chomp($self->{_pid} = <$pidfile>);
|
||||
print "# Postmaster PID for node \"$name\" is $self->{_pid}\n";
|
||||
close $pidfile;
|
||||
|
||||
# If we aren't sure what to expect, validate the PID using kill().
|
||||
# This protects against stale PID files left by crashed postmasters.
|
||||
if ($is_running == -1 && kill(0, $self->{_pid}) == 0)
|
||||
{
|
||||
print
|
||||
"# Stale postmaster.pid file for node \"$name\": PID $self->{_pid} no longer exists\n";
|
||||
$self->{_pid} = undef;
|
||||
return;
|
||||
}
|
||||
|
||||
print "# Postmaster PID for node \"$name\" is $self->{_pid}\n";
|
||||
|
||||
# If we found a pidfile when there shouldn't be one, complain.
|
||||
BAIL_OUT("postmaster.pid unexpectedly present") if $is_running == 0;
|
||||
return;
|
||||
|
@ -207,6 +207,9 @@ sub poll_start
|
||||
# Wait 0.1 second before retrying.
|
||||
usleep(100_000);
|
||||
|
||||
# Clean up in case the start attempt just timed out or some such.
|
||||
$node->stop('fast', fail_ok => 1);
|
||||
|
||||
$attempts++;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user