mirror of https://github.com/postgres/postgres
Add regression test for restart points during promotion
This test serves as a way to demonstrate how to use the features introduced in 37b369dc67, while providing coverage for the fix done in 7863ee4def, without which the startup process could throw "PANIC: could not locate a valid checkpoint record" when starting recovery.
The test checks that a node is able to properly restart following a crash when a restart point was finishing across a promotion, with an injection point added in the middle of CreateRestartPoint() to stop the restartpoint in flight. Note that this test fails when 7863ee4def is reverted.
Kyotaro Horiguchi is the original author of this test, which was originally posted on the thread where 7863ee4def was discussed. I have just upgraded and polished it to rely on injection points, making it much cheaper to reproduce the failure.
This test requires injection points to be enabled in the builds, hence meson and ./configure need an update to pass this knowledge down to the test. The name of the new injection point follows the same naming convention as 6a1ea02c49.
The Makefile's EXTRA_INSTALL of recovery TAP tests is updated to include modules/injection_points.
Author: Kyotaro Horiguchi, Michael Paquier Reviewed-by: Andrey Borodin, Bertrand Drouvot Discussion: https://postgr.es/m/ZdLuxBk5hGpol91B@paquier.xyz
This commit is contained in:
parent
37b369dc67
commit
6782709df8
|
@ -100,6 +100,7 @@
|
|||
#include "storage/sync.h"
|
||||
#include "utils/guc_hooks.h"
|
||||
#include "utils/guc_tables.h"
|
||||
#include "utils/injection_point.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/ps_status.h"
|
||||
#include "utils/relmapper.h"
|
||||
|
@ -7528,6 +7529,12 @@ CreateRestartPoint(int flags)
|
|||
|
||||
CheckPointGuts(lastCheckPoint.redo, flags);
|
||||
|
||||
/*
|
||||
* This location needs to be after CheckPointGuts() to ensure that some
|
||||
* work has already happened during this checkpoint.
|
||||
*/
|
||||
INJECTION_POINT("create-restart-point");
|
||||
|
||||
/*
|
||||
* Remember the prior checkpoint's redo ptr for
|
||||
* UpdateCheckPointDistanceEstimate()
|
||||
|
|
|
@ -9,12 +9,17 @@
|
|||
#
|
||||
#-------------------------------------------------------------------------
|
||||
|
||||
EXTRA_INSTALL=contrib/pg_prewarm contrib/pg_stat_statements contrib/test_decoding
|
||||
EXTRA_INSTALL=contrib/pg_prewarm \
|
||||
contrib/pg_stat_statements \
|
||||
contrib/test_decoding \
|
||||
src/test/modules/injection_points
|
||||
|
||||
subdir = src/test/recovery
|
||||
top_builddir = ../../..
|
||||
include $(top_builddir)/src/Makefile.global
|
||||
|
||||
# Pass the injection point build setting down to the TAP tests, which
# check it through their environment.
export enable_injection_points
|
||||
|
||||
# required for 017_shm.pl and 027_stream_regress.pl
|
||||
REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX)
|
||||
export REGRESS_SHLIB
|
||||
|
|
|
@ -6,6 +6,9 @@ tests += {
|
|||
'bd': meson.current_build_dir(),
|
||||
'tap': {
|
||||
'test_kwargs': {'priority': 40}, # recovery tests are slow, start early
|
||||
'env': {
|
||||
'enable_injection_points': get_option('injection_points') ? 'yes' : 'no',
|
||||
},
|
||||
'tests': [
|
||||
't/001_stream_rep.pl',
|
||||
't/002_archiving.pl',
|
||||
|
@ -46,6 +49,7 @@ tests += {
|
|||
't/038_save_logical_slots_shutdown.pl',
|
||||
't/039_end_of_wal.pl',
|
||||
't/040_standby_failover_slots_sync.pl',
|
||||
't/041_checkpoint_at_promote.pl',
|
||||
],
|
||||
},
|
||||
}
|
||||
|
|
|
@ -0,0 +1,170 @@
|
|||
|
||||
# Copyright (c) 2024, PostgreSQL Global Development Group
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use PostgreSQL::Test::Cluster;
|
||||
use PostgreSQL::Test::Utils;
|
||||
use Time::HiRes qw(usleep);
|
||||
use Test::More;
|
||||
|
||||
##################################################
|
||||
# Test race condition when a restart point is running during a promotion,
|
||||
# checking that WAL segments are correctly removed in the restart point
|
||||
# while the promotion finishes.
|
||||
#
|
||||
# This test relies on an injection point that causes the checkpointer to
|
||||
# wait in the middle of a restart point on a standby. The checkpointer
|
||||
# is woken up to finish its restart point only once the promotion of the
|
||||
# standby is completed, and the node should be able to restart properly.
|
||||
##################################################
|
||||
|
||||
# Skip the whole test unless the build supports injection points.  This
# is read from the enable_injection_points environment variable.  An unset
# variable is treated like any value other than 'yes', so that running
# this script with the variable missing skips cleanly without an
# "uninitialized value" warning.
if (($ENV{enable_injection_points} // '') ne 'yes')
{
	plan skip_all => 'Injection points not supported by this build';
}
|
||||
|
||||
# Initialize primary node. log_checkpoints is required as the checkpoint
|
||||
# activity is monitored based on the contents of the logs.
|
||||
my $node_primary = PostgreSQL::Test::Cluster->new('master');
|
||||
$node_primary->init(allows_streaming => 1);
|
||||
$node_primary->append_conf(
|
||||
'postgresql.conf', q[
|
||||
log_checkpoints = on
|
||||
restart_after_crash = on
|
||||
]);
|
||||
$node_primary->start;
|
||||
|
||||
my $backup_name = 'my_backup';
|
||||
$node_primary->backup($backup_name);
|
||||
|
||||
# Setup a standby.
|
||||
my $node_standby = PostgreSQL::Test::Cluster->new('standby1');
|
||||
$node_standby->init_from_backup($node_primary, $backup_name,
|
||||
has_streaming => 1);
|
||||
$node_standby->start;
|
||||
|
||||
# Dummy table for the upcoming tests.
|
||||
$node_primary->safe_psql('postgres', 'checkpoint');
|
||||
$node_primary->safe_psql('postgres', 'CREATE TABLE prim_tab (a int);');
|
||||
|
||||
# Register an injection point on the standby so that the follow-up
|
||||
# restart point will wait on it.
|
||||
$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
|
||||
# Wait until the extension has been created on the standby
|
||||
$node_primary->wait_for_replay_catchup($node_standby);
|
||||
|
||||
# Note that from this point the checkpointer will wait in the middle of
|
||||
# a restart point on the standby.
|
||||
$node_standby->safe_psql('postgres',
|
||||
"SELECT injection_points_attach('create-restart-point', 'wait');");
|
||||
|
||||
# Execute a restart point on the standby, that we will now be waiting on.
|
||||
# This needs to be in the background.
|
||||
my $logstart = -s $node_standby->logfile;
|
||||
my $psql_session =
|
||||
$node_standby->background_psql('postgres', on_error_stop => 0);
|
||||
$psql_session->query_until(
|
||||
qr/starting_checkpoint/, q(
|
||||
\echo starting_checkpoint
|
||||
CHECKPOINT;
|
||||
));
|
||||
|
||||
# Switch one WAL segment to make the previous restart point remove the
|
||||
# segment once the restart point completes.
|
||||
$node_primary->safe_psql('postgres', 'INSERT INTO prim_tab VALUES (1);');
|
||||
$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal();');
|
||||
$node_primary->wait_for_replay_catchup($node_standby);
|
||||
|
||||
# Wait until the checkpointer is in the middle of the restart point
|
||||
# processing.
|
||||
ok( $node_standby->poll_query_until(
|
||||
'postgres',
|
||||
qq[SELECT count(*) FROM pg_stat_activity
|
||||
WHERE backend_type = 'checkpointer' AND wait_event = 'create-restart-point' ;],
|
||||
'1'),
|
||||
'checkpointer is waiting in restart point'
|
||||
) or die "Timed out while waiting for checkpointer to run restart point";
|
||||
|
||||
# Check the logs that the restart point has started on standby. This is
|
||||
# optional, but let's be sure.
|
||||
ok( $node_standby->log_contains(
|
||||
"restartpoint starting: immediate wait", $logstart),
|
||||
"restartpoint has started");
|
||||
|
||||
# Trigger promotion during the restart point.
|
||||
$node_primary->stop;
|
||||
$node_standby->promote;
|
||||
|
||||
# Update the start position before waking up the checkpointer!
|
||||
$logstart = -s $node_standby->logfile;
|
||||
|
||||
# Now wake up the checkpointer.
|
||||
$node_standby->safe_psql('postgres',
|
||||
"SELECT injection_points_wakeup('create-restart-point');");
|
||||
|
||||
# Wait until the previous restart point completes on the newly-promoted
# standby, checking the logs for that.  The log is searched only from
# $logstart, the log size recorded before waking up the checkpointer, so
# that log entries written before the wakeup cannot satisfy the check.
#
# The loop polls every 100ms, for up to timeout_default seconds.
my $checkpoint_complete = 0;
foreach my $i (0 .. 10 * $PostgreSQL::Test::Utils::timeout_default)
{
	# The offset has to be passed to log_contains() itself.  If it is
	# placed outside the call, the condition becomes a comma expression
	# that is always true and the loop stops without waiting.
	if ($node_standby->log_contains("restartpoint complete", $logstart))
	{
		$checkpoint_complete = 1;
		last;
	}
	usleep(100_000);
}
is($checkpoint_complete, 1, 'restart point has completed');
|
||||
|
||||
# Kill with SIGKILL, forcing all the backends to restart.
|
||||
my $psql_timeout = IPC::Run::timer(3600);
|
||||
my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
|
||||
my $killme = IPC::Run::start(
|
||||
[
|
||||
'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
|
||||
$node_standby->connstr('postgres')
|
||||
],
|
||||
'<',
|
||||
\$killme_stdin,
|
||||
'>',
|
||||
\$killme_stdout,
|
||||
'2>',
|
||||
\$killme_stderr,
|
||||
$psql_timeout);
|
||||
$killme_stdin .= q[
|
||||
SELECT pg_backend_pid();
|
||||
];
|
||||
$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
|
||||
my $pid = $killme_stdout;
|
||||
chomp($pid);
|
||||
$killme_stdout = '';
|
||||
$killme_stderr = '';
|
||||
|
||||
my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid);
|
||||
is($ret, 0, 'killed process with KILL');
|
||||
|
||||
# Wait until the server restarts, finish consuming output.
|
||||
$killme_stdin .= q[
|
||||
SELECT 1;
|
||||
];
|
||||
ok( pump_until(
|
||||
$killme,
|
||||
$psql_timeout,
|
||||
\$killme_stderr,
|
||||
qr/server closed the connection unexpectedly|connection to server was lost|could not send data to server/m
|
||||
),
|
||||
"psql query died successfully after SIGKILL");
|
||||
$killme->finish;
|
||||
|
||||
# Wait till server finishes restarting.
|
||||
$node_standby->poll_query_until('postgres', undef, '');
|
||||
|
||||
# After recovery, the server should be able to start.
|
||||
my $stdout;
|
||||
my $stderr;
|
||||
($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1');
|
||||
is($ret, 0, "psql connect success");
|
||||
is($stdout, 1, "psql select 1");
|
||||
|
||||
done_testing();
|
Loading…
Reference in New Issue