diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index eb02e3b6a6..c9c9b37fbb 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -100,6 +100,7 @@
 #include "storage/sync.h"
 #include "utils/guc_hooks.h"
 #include "utils/guc_tables.h"
+#include "utils/injection_point.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
 #include "utils/relmapper.h"
@@ -7528,6 +7529,12 @@ CreateRestartPoint(int flags)
 
 	CheckPointGuts(lastCheckPoint.redo, flags);
 
+	/*
+	 * This location needs to be after CheckPointGuts() to ensure that some
+	 * work has already happened during this checkpoint.
+	 */
+	INJECTION_POINT("create-restart-point");
+
 	/*
 	 * Remember the prior checkpoint's redo ptr for
 	 * UpdateCheckPointDistanceEstimate()
diff --git a/src/test/recovery/Makefile b/src/test/recovery/Makefile
index 17ee353735..f57baba5e8 100644
--- a/src/test/recovery/Makefile
+++ b/src/test/recovery/Makefile
@@ -9,12 +9,17 @@
 #
 #-------------------------------------------------------------------------
 
-EXTRA_INSTALL=contrib/pg_prewarm contrib/pg_stat_statements contrib/test_decoding
+EXTRA_INSTALL=contrib/pg_prewarm \
+	contrib/pg_stat_statements \
+	contrib/test_decoding \
+	src/test/modules/injection_points
 
 subdir = src/test/recovery
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
+export enable_injection_points
+
 # required for 017_shm.pl and 027_stream_regress.pl
 REGRESS_SHLIB=$(abs_top_builddir)/src/test/regress/regress$(DLSUFFIX)
 export REGRESS_SHLIB
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index bf087ac2a9..c67249500e 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -6,6 +6,9 @@ tests += {
   'bd': meson.current_build_dir(),
   'tap': {
     'test_kwargs': {'priority': 40}, # recovery tests are slow, start early
+    'env': {
+      'enable_injection_points': get_option('injection_points') ?
'yes' : 'no',
+    },
     'tests': [
       't/001_stream_rep.pl',
       't/002_archiving.pl',
@@ -46,6 +49,7 @@ tests += {
       't/038_save_logical_slots_shutdown.pl',
       't/039_end_of_wal.pl',
       't/040_standby_failover_slots_sync.pl',
+      't/041_checkpoint_at_promote.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/041_checkpoint_at_promote.pl b/src/test/recovery/t/041_checkpoint_at_promote.pl
new file mode 100644
index 0000000000..1a6a8d86a1
--- /dev/null
+++ b/src/test/recovery/t/041_checkpoint_at_promote.pl
@@ -0,0 +1,170 @@
+
+# Copyright (c) 2024, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Time::HiRes qw(usleep);
+use Test::More;
+
+##################################################
+# Test race condition when a restart point is running during a promotion,
+# checking that WAL segments are correctly removed in the restart point
+# while the promotion finishes.
+#
+# This test relies on an injection point that causes the checkpointer to
+# wait in the middle of a restart point on a standby.  The checkpointer
+# is awakened to finish its restart point only once the promotion of the
+# standby is completed, and the node should be able to restart properly.
+##################################################
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+# Initialize primary node.  log_checkpoints is required as the checkpoint
+# activity is monitored based on the contents of the logs.
+my $node_primary = PostgreSQL::Test::Cluster->new('master');
+$node_primary->init(allows_streaming => 1);
+$node_primary->append_conf(
+	'postgresql.conf', q[
+log_checkpoints = on
+restart_after_crash = on
+]);
+$node_primary->start;
+
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Setup a standby.
+my $node_standby = PostgreSQL::Test::Cluster->new('standby1');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+$node_standby->start;
+
+# Dummy table for the upcoming tests.
+$node_primary->safe_psql('postgres', 'checkpoint');
+$node_primary->safe_psql('postgres', 'CREATE TABLE prim_tab (a int);');
+
+# Register an injection point on the standby so that the follow-up
+# restart point will wait on it.
+$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
+# Wait until the extension has been created on the standby
+$node_primary->wait_for_replay_catchup($node_standby);
+
+# Note that from this point the checkpointer will wait in the middle of
+# a restart point on the standby.
+$node_standby->safe_psql('postgres',
+	"SELECT injection_points_attach('create-restart-point', 'wait');");
+
+# Execute a restart point on the standby, that we will now be waiting on.
+# This needs to be in the background.
+my $logstart = -s $node_standby->logfile;
+my $psql_session =
+	$node_standby->background_psql('postgres', on_error_stop => 0);
+$psql_session->query_until(
+	qr/starting_checkpoint/, q(
+	\echo starting_checkpoint
+	CHECKPOINT;
+));
+
+# Switch one WAL segment to make the previous restart point remove the
+# segment once the restart point completes.
+$node_primary->safe_psql('postgres', 'INSERT INTO prim_tab VALUES (1);');
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal();');
+$node_primary->wait_for_replay_catchup($node_standby);
+
+# Wait until the checkpointer is in the middle of the restart point
+# processing.
+ok( $node_standby->poll_query_until(
+		'postgres',
+		qq[SELECT count(*) FROM pg_stat_activity
+		   WHERE backend_type = 'checkpointer' AND wait_event = 'create-restart-point' ;],
+		'1'),
+	'checkpointer is waiting in restart point'
+) or die "Timed out while waiting for checkpointer to run restart point";
+
+# Check the logs that the restart point has started on standby.
This is
+# optional, but let's be sure.
+ok( $node_standby->log_contains(
+		"restartpoint starting: immediate wait", $logstart),
+	"restartpoint has started");
+
+# Trigger promotion during the restart point.
+$node_primary->stop;
+$node_standby->promote;
+
+# Update the start position before waking up the checkpointer!
+$logstart = -s $node_standby->logfile;
+
+# Now wake up the checkpointer.
+$node_standby->safe_psql('postgres',
+	"SELECT injection_points_wakeup('create-restart-point');");
+
+# Wait until the previous restart point completes on the newly-promoted
+# standby, checking the logs for that.
+my $checkpoint_complete = 0;
+foreach my $i (0 .. 10 * $PostgreSQL::Test::Utils::timeout_default)
+{
+	# $logstart must be the second argument of log_contains(); with it
+	# outside the call the condition was a comma expression that always
+	# evaluated true, skipping the log check entirely.
+	if ($node_standby->log_contains("restartpoint complete", $logstart))
+	{
+		$checkpoint_complete = 1;
+		last;
+	}
+	usleep(100_000);
+}
+is($checkpoint_complete, 1, 'restart point has completed');
+
+# Kill with SIGKILL, forcing all the backends to restart.
+my $psql_timeout = IPC::Run::timer(3600);
+my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
+my $killme = IPC::Run::start(
+	[
+		'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d',
+		$node_standby->connstr('postgres')
+	],
+	'<',
+	\$killme_stdin,
+	'>',
+	\$killme_stdout,
+	'2>',
+	\$killme_stderr,
+	$psql_timeout);
+$killme_stdin .= q[
+SELECT pg_backend_pid();
+];
+$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
+my $pid = $killme_stdout;
+chomp($pid);
+$killme_stdout = '';
+$killme_stderr = '';
+
+my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid);
+is($ret, 0, 'killed process with KILL');
+
+# Wait until the server restarts, finish consuming output.
+$killme_stdin .= q[
+SELECT 1;
+];
+ok( pump_until(
+		$killme,
+		$psql_timeout,
+		\$killme_stderr,
+		qr/server closed the connection unexpectedly|connection to server was lost|could not send data to server/m
+	),
+	"psql query died successfully after SIGKILL");
+$killme->finish;
+
+# Wait till server finishes restarting.  NOTE(review): with an undef
+# query this appears to wait until connections are accepted again --
+# confirm against PostgreSQL::Test::Cluster::poll_query_until.
+$node_standby->poll_query_until('postgres', undef, '');
+
+# After recovery, the server should be able to start.
+my $stdout;
+my $stderr;
+($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1');
+is($ret, 0, "psql connect success");
+is($stdout, 1, "psql select 1");
+
+done_testing();