Tighten TAP tests' tracking of postmaster state some more.
Commits 6c4a8903b et al. had a couple of deficiencies:

* The logic I added to Cluster::start to see if a PID file is present
could be fooled by a stale PID file left over from a previous
postmaster.  To fix, if we're not sure whether we expect to find a
running postmaster or not, validate the PID using "kill 0".

* 017_shm.pl has a loop in which it just issues repeated Cluster::start
calls; this will fail if some invocation fails but leaves self->_pid
set.  Per buildfarm results, the above fix is not enough to make this
safe: we might have "validated" a PID for a postmaster that exits
immediately after we look.  Hence, match each failed start call with a
stop call that will get us back to the self->_pid == undef state.
Add a fail_ok option to Cluster::stop to make this work.

Discussion: https://postgr.es/m/CA+hUKGKV6fOHvfiPt8=dOKzvswjAyLoFoJF1iQXMNpi7+hD1JQ@mail.gmail.com
commit 4828a80069
parent 31680730ef
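
As background for the first fix: signal 0 is the conventional way to test
whether a PID still refers to a live process.  Below is a minimal standalone
Perl sketch of that check, for illustration only; it is not code from this
commit, and the pid-file path is made up.

    use strict;
    use warnings;

    # Hypothetical pid-file path, for illustration only.
    my $pidfile = "/tmp/demo_postmaster.pid";

    if (open my $fh, '<', $pidfile)
    {
        chomp(my $pid = <$fh>);
        close $fh;

        # kill(0, $pid) delivers no signal; it returns the number of
        # processes that could be signaled, so 0 means the PID no longer
        # exists (or we lack permission to signal it).
        if (kill(0, $pid))
        {
            print "PID $pid is still running\n";
        }
        else
        {
            print "PID $pid is gone; the pid file is stale\n";
        }
    }
    else
    {
        print "no pid file present\n";
    }
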
@@ -831,20 +831,37 @@ Note: if the node is already known stopped, this does nothing.
 However, if we think it's running and it's not, it's important for
 this to fail.  Otherwise, tests might fail to detect server crashes.
 
+With optional extra param fail_ok => 1, returns 0 for failure
+instead of bailing out.
+
 =cut
 
 sub stop
 {
-    my ($self, $mode) = @_;
-    my $port   = $self->port;
+    my ($self, $mode, %params) = @_;
     my $pgdata = $self->data_dir;
     my $name   = $self->name;
+    my $ret;
     $mode = 'fast' unless defined $mode;
-    return unless defined $self->{_pid};
+    return 1 unless defined $self->{_pid};
+
     print "### Stopping node \"$name\" using mode $mode\n";
-    TestLib::system_or_bail('pg_ctl', '-D', $pgdata, '-m', $mode, 'stop');
+    $ret = TestLib::system_log('pg_ctl', '-D', $pgdata,
+        '-m', $mode, 'stop');
+
+    if ($ret != 0)
+    {
+        print "# pg_ctl stop failed: $ret\n";
+
+        # Check to see if we still have a postmaster or not.
+        $self->_update_pid(-1);
+
+        BAIL_OUT("pg_ctl stop failed") unless $params{fail_ok};
+        return 0;
+    }
+
     $self->_update_pid(0);
-    return;
+    return 1;
 }
 
 =pod
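
With this change, callers can opt out of the BAIL_OUT on shutdown failure.
A rough usage sketch follows; it is not taken from the commit and assumes
$node is an already-initialized node object.

    # stop() now returns 1 on success; with fail_ok => 1 it returns 0 on
    # failure instead of bailing out of the whole test run.
    if (!$node->stop('fast', fail_ok => 1))
    {
        print "# shutdown failed; _update_pid(-1) has re-checked node state\n";
    }

Without fail_ok, a failed pg_ctl stop still ends the test via BAIL_OUT,
as before.
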
@@ -1066,9 +1083,20 @@ sub _update_pid
     if (open my $pidfile, '<', $self->data_dir . "/postmaster.pid")
     {
         chomp($self->{_pid} = <$pidfile>);
-        print "# Postmaster PID for node \"$name\" is $self->{_pid}\n";
         close $pidfile;
 
+        # If we aren't sure what to expect, validate the PID using kill().
+        # This protects against stale PID files left by crashed postmasters.
+        if ($is_running == -1 && kill(0, $self->{_pid}) == 0)
+        {
+            print
+              "# Stale postmaster.pid file for node \"$name\": PID $self->{_pid} no longer exists\n";
+            $self->{_pid} = undef;
+            return;
+        }
+
+        print "# Postmaster PID for node \"$name\" is $self->{_pid}\n";
+
         # If we found a pidfile when there shouldn't be one, complain.
         BAIL_OUT("postmaster.pid unexpectedly present") if $is_running == 0;
         return;
@@ -204,6 +204,9 @@ sub poll_start
         # Wait 0.1 second before retrying.
         usleep(100_000);
 
+        # Clean up in case the start attempt just timed out or some such.
+        $node->stop('fast', fail_ok => 1);
+
         $attempts++;
     }
 
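
For context, the loop patched above is 017_shm.pl's poll_start(); after this
change each unsuccessful start attempt is paired with a fail_ok stop, so the
node is back in the _pid == undef state before the next retry.  A paraphrased
sketch of that loop follows; it is not the verbatim test code, start() is
assumed to accept fail_ok => 1 (as used by the earlier commit), and
$max_attempts is illustrative.

    use Time::HiRes qw(usleep);

    # Paraphrased sketch of the retry loop, not the verbatim test code.
    sub poll_start_sketch
    {
        my ($node) = @_;

        my $max_attempts = 180 * 10;    # illustrative limit
        my $attempts     = 0;

        while ($attempts < $max_attempts)
        {
            # A successful start ends the polling.
            $node->start(fail_ok => 1) && return 1;

            # Wait 0.1 second before retrying.
            usleep(100_000);

            # Clean up in case the start attempt just timed out or some such;
            # this is what returns the node to the _pid == undef state.
            $node->stop('fast', fail_ok => 1);

            $attempts++;
        }
        return 0;
    }
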