diff --git a/.github/workflows/build-gpdb.yml b/.github/workflows/build-gpdb.yml
index 6872dd66f32..d2214988812 100644
--- a/.github/workflows/build-gpdb.yml
+++ b/.github/workflows/build-gpdb.yml
@@ -53,7 +53,7 @@ jobs:
               },
               {
                 "name": "test",
-                "extra_configure_flags": "--enable-debug-extensions"
+                "extra_configure_flags": "--enable-debug-extensions --enable-tap-tests"
               }
             ]}'
           # Output the matrix for GitHub Actions
@@ -168,6 +168,10 @@ jobs:
             },
             {"test":"ic-mirrorless",
              "make_configs":["src/test/isolation2:installcheck-mirrorless"]
+            },
+            {"test":"ic-recovery",
+             "install_target":true,
+             "make_configs":["src/test/recovery:installcheck"]
             }
           ]
           }'
@@ -362,7 +366,7 @@ jobs:
           # set env for debian build
           export BUILD_DESTINATION=${SRC_DIR}/debian/build
 
-          if ! su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} CONFIGURE_EXTRA_OPTS=${{ matrix.extra_configure_flags }} BUILD_DESTINATION=${BUILD_DESTINATION} ${SRC_DIR}/../gpdb-devops/build_automation/gpdb/scripts/configure-gpdb.sh"; then
+          if ! su - gpadmin -c "cd ${SRC_DIR} && SRC_DIR=${SRC_DIR} ENABLE_DEBUG=${{ env.ENABLE_DEBUG }} CONFIGURE_EXTRA_OPTS='${{ matrix.extra_configure_flags }}' BUILD_DESTINATION=${BUILD_DESTINATION} ${SRC_DIR}/../gpdb-devops/build_automation/gpdb/scripts/configure-gpdb.sh"; then
             echo "::error::Configure script failed"
             exit 1
           fi
@@ -1016,7 +1020,7 @@ jobs:
           # set env for debian build
           export BUILD_DESTINATION=/opt/greenplum-db-6
 
-          if ! su - gpadmin -c "cd ${SRC_DIR} && ./configure --with-perl --with-python --with-libxml --enable-mapreduce --with-gssapi --prefix=${BUILD_DESTINATION} --with-ldap --enable-gpperfmon --with-pam --with-openssl --disable-pxf --enable-ic-proxy --enable-orafce --without-mdblocales --with-zstd"; then
+          if ! su - gpadmin -c "cd ${SRC_DIR} && ./configure --with-perl --with-python --with-libxml --enable-mapreduce --with-gssapi --prefix=${BUILD_DESTINATION} --with-ldap --enable-gpperfmon --with-pam --with-openssl --disable-pxf --enable-ic-proxy --enable-orafce --without-mdblocales --with-zstd --enable-tap-tests"; then
             echo "::error::Configure script failed"
             exit 1
           fi
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 2b8112a154b..46fd4c7cc07 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -112,6 +112,7 @@ static void progress_report(int tablespacenum, const char *filename, bool force)
 
 static void ReceiveTarFile(PGconn *conn, PGresult *res, int rownum);
 static void ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum);
+static char *GetRestoreCommandHint(PQExpBufferData conninfo_buf);
 static void GenerateRecoveryConf(PGconn *conn);
 static void WriteRecoveryConf(void);
 static void BaseBackup(const char *argv0);
@@ -1572,6 +1573,54 @@ escape_quotes(const char *src)
 
 #define GP_WALRECEIVER_APPNAME "gp_walreceiver"
 
+/*
+ * GetRestoreCommandHint
+ *
+ * Query the restore_command_hint GUC from the primary over a regular
+ * (non-replication) connection opened with the given connection info.
+ *
+ * Returns a pg_strdup'ed copy of the GUC value, or NULL on failure (a
+ * message is printed to stderr). The caller must free() the result.
+ */
+static char *
+GetRestoreCommandHint(PQExpBufferData conninfo_buf)
+{
+	PGconn	   *regular_conn;
+	PGresult   *restore_cmd_hint_res;
+	char	   *restore_cmd_hint = NULL;
+
+	regular_conn = PQconnectdb(conninfo_buf.data);
+
+	if (PQstatus(regular_conn) != CONNECTION_OK)
+	{
+		fprintf(stderr, _("%s: could not connect to primary: %s"),
+				progname, PQerrorMessage(regular_conn));
+		PQfinish(regular_conn);
+		return NULL;
+	}
+
+	restore_cmd_hint_res = PQexec(regular_conn, "SHOW restore_command_hint");
+
+	if (PQresultStatus(restore_cmd_hint_res) == PGRES_TUPLES_OK &&
+		PQntuples(restore_cmd_hint_res) > 0 &&
+		!PQgetisnull(restore_cmd_hint_res, 0, 0))
+	{
+		restore_cmd_hint = pg_strdup(PQgetvalue(restore_cmd_hint_res, 0, 0));
+	}
+	else
+	{
+		/* No "\n" here: a non-empty PQerrorMessage() already ends with one. */
+		fprintf(stderr, _("%s: could not get restore_command_hint: %s"),
+				progname, PQerrorMessage(regular_conn));
+	}
+
+	PQclear(restore_cmd_hint_res);
+	PQfinish(regular_conn);
+
+	return restore_cmd_hint;
+}
+
+
 /*
  * Create a recovery.conf file in memory using a PQExpBuffer
  */
@@ -1582,6 +1631,7 @@ GenerateRecoveryConf(PGconn *conn)
 	PQconninfoOption *option;
 	PQExpBufferData conninfo_buf;
 	char	   *escaped;
+	char	   *restore_cmd_hint;
 
 	recoveryconfcontents = createPQExpBuffer();
 	if (!recoveryconfcontents)
@@ -1628,6 +1678,8 @@ GenerateRecoveryConf(PGconn *conn)
 		free(escaped);
 	}
 
+	restore_cmd_hint = GetRestoreCommandHint(conninfo_buf);
+
 	appendPQExpBuffer(&conninfo_buf, " application_name=%s", GP_WALRECEIVER_APPNAME);
 	/*
 	 * Escape the connection string, so that it can be put in the config file.
@@ -1645,6 +1697,15 @@ GenerateRecoveryConf(PGconn *conn)
 		free(escaped);
 	}
 
+	/* Skip an empty hint: restore_command = '' would not restore anything. */
+	if (restore_cmd_hint && restore_cmd_hint[0] != '\0')
+	{
+		escaped = escape_quotes(restore_cmd_hint);
+		appendPQExpBuffer(recoveryconfcontents, "restore_command = '%s'\n", escaped);
+		free(escaped);
+	}
+	free(restore_cmd_hint);		/* pg_strdup'ed by GetRestoreCommandHint; free(NULL) is a no-op */
+
 	if (PQExpBufferBroken(recoveryconfcontents) ||
 		PQExpBufferDataBroken(conninfo_buf))
 	{
diff --git a/src/bin/pg_rewind/libpq_fetch.c b/src/bin/pg_rewind/libpq_fetch.c
index 6a91fe6fc26..bd97941a944 100644
--- a/src/bin/pg_rewind/libpq_fetch.c
+++ b/src/bin/pg_rewind/libpq_fetch.c
@@ -737,6 +737,13 @@ GenerateRecoveryConf(char *replication_slot)
 		free(escaped);
 	}
 
+	if (restore_command)
+	{
+		escaped = escape_quotes(restore_command);
+		appendPQExpBuffer(recoveryconfcontents, "restore_command = '%s'\n", escaped);
+		free(escaped);
+	}
+
 	if (PQExpBufferBroken(recoveryconfcontents) ||
 		PQExpBufferDataBroken(conninfo_buf))
 	{
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index b5fa82252f0..a82fbab1390 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -868,7 +868,7 @@ getRestoreCommand(const char *argv0)
 				postgres_cmd[MAXPGPATH],
 				cmd_output[MAXPGPATH];
 
-	if (!restore_wal)
+	if (!(restore_wal || writerecoveryconf))
 		return;
 
 	/* find postgres executable */
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index 60efcc62f50..84e0e0f0b35 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -23,6 +23,7 @@
 extern char *datadir_target;
 extern char *datadir_source;
 extern char *connstr_source;
+extern char *restore_command;
 extern bool debug;
 extern bool showprogress;
 extern bool dry_run;
diff --git a/src/bin/pg_rewind/sql/run_test.sh b/src/bin/pg_rewind/sql/run_test.sh
old mode 100644
new mode 100755
index ead83fd7245..6d53cc2b550
--- a/src/bin/pg_rewind/sql/run_test.sh
+++ b/src/bin/pg_rewind/sql/run_test.sh
@@ -41,6 +41,7 @@ max_connections = 50
 listen_addresses = '$LISTEN_ADDRESSES'
 port = $PORT_MASTER
 wal_keep_segments=5
+restore_command_hint='/bin/true'
 EOF
 
 # Accept replication connections on master
@@ -141,20 +142,35 @@ if [ $TEST_SUITE == "local" ]; then
         --progress \
         --debug \
         --source-pgdata=$TEST_STANDBY \
-        --target-pgdata=$TEST_MASTER >>$log_path 2>&1
+        --target-pgdata=$TEST_MASTER \
+        --write-recovery-conf >>$log_path 2>&1
 elif [ $TEST_SUITE == "remote" ]; then
     # Do rewind using a remote connection as source
     PGOPTIONS=${PGOPTIONS_UTILITY} pg_rewind \
         --progress \
         --debug \
         --source-server="port=$PORT_STANDBY dbname=postgres" \
-        --target-pgdata=$TEST_MASTER >>$log_path 2>&1
+        --target-pgdata=$TEST_MASTER \
+        --write-recovery-conf >>$log_path 2>&1
 else
     # Cannot come here normally
     echo "Incorrect test suite specified"
     exit 1
 fi
 
+# Check that recovery.conf exists and that restore_command is taken from restore_command_hint.
+if [ "$TEST_SUITE" = "remote" ]; then
+    if [ ! -f "$TEST_MASTER/recovery.conf" ]; then
+        echo "recovery.conf file is missing."
+        exit 1
+    fi
+
+    if ! grep -qF "restore_command = '/bin/true'" "$TEST_MASTER/recovery.conf"; then
+        echo "Restore command was not found in recovery.conf"
+        exit 1
+    fi
+fi
+
 # After rewind is done, restart the source node in local mode.
 if [ $TEST_SUITE == "local" ]; then
     pg_ctl -w -D $TEST_STANDBY start -o "$STANDBY_PG_CTL_OPTIONS" >>$log_path 2>&1
diff --git a/src/test/recovery/t/139_auto_restore_archive.pl b/src/test/recovery/t/139_auto_restore_archive.pl
new file mode 100644
index 00000000000..844de80ff3e
--- /dev/null
+++ b/src/test/recovery/t/139_auto_restore_archive.pl
@@ -0,0 +1,160 @@
+# Test for fixing timeline collision when restore_command is not written into
+# the generated recovery.conf during recovery, resulting in the promoting node not seeing
+# a newer timeline ID in the archive. Now restore_command is written to recovery.conf
+# (sourced from the primary's restore_command_hint during recovery)
+
+# Steps:
+# 1. Create master + standby, promote standby -> timeline 2.
+# 2. Stop standby, clean its datadir, rebuild from master.
+# 3. Standby now on timeline 1, but timeline 2 exists in shared archive.
+# 4. Promote standby again -> should fetch timeline 2 from archive using
+#    restore_command and jump to timeline 3.
+
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More tests => 6;
+use File::Path qw(rmtree);
+
+# Initialize master with archiving and streaming
+my $node_master = get_new_node('master');
+$node_master->init(has_archiving => 1, allows_streaming => 1);
+
+# Declare master's archive for both master and standby
+my $archive_dir = $node_master->archive_dir;
+
+# Append restore_command_hint to master's postgresql.conf
+my $path = TestLib::perl2host($archive_dir);
+$path =~ s{\\}{\\\\}g if ($TestLib::windows_os);
+my $restore_cmd = $TestLib::windows_os
+	? qq{copy "$path\\\\%f" "%p"}
+	: qq{cp "$path/%f" "%p"};
+
+$node_master->append_conf('postgresql.conf',
+	"restore_command_hint = '$restore_cmd'\n");
+
+$node_master->start;
+
+# Create test data
+$node_master->safe_psql('postgres', "CREATE TABLE test_tbl AS SELECT 1 AS id");
+
+# Take backup for standby
+my $backup_name = 'my_backup';
+$node_master->backup($backup_name);
+
+# Initialize standby from backup
+my $node_standby = get_new_node('standby');
+$node_standby->init_from_backup($node_master, $backup_name,
+	has_streaming => 1,
+	has_restoring => 1);
+
+# Enable archiving on standby for timeline history archival.
+# Use the host-converted, backslash-escaped $path, same as restore_command_hint above.
+my $archive_cmd = $TestLib::windows_os
+	? qq{copy "%p" "$path\\\\%f"}
+	: qq{cp "%p" "$path/%f"};
+
+$node_standby->append_conf('postgresql.conf', qq{
+archive_mode = on
+archive_command = '$archive_cmd'
+});
+
+# Disable hot standby as it is not supported
+$node_standby->append_conf('postgresql.conf', "hot_standby = off\n");
+
+# Start standby and wait for it to catch up
+$node_standby->start;
+my $target_lsn = $node_master->lsn('insert');
+$node_master->wait_for_catchup($node_standby, 'replay', $target_lsn);
+
+# First promotion: standby moves to timeline 2
+$node_standby->promote;
+$node_standby->poll_query_until('postgres', "SELECT NOT pg_is_in_recovery()")
+	or die "Timed out waiting for first promotion";
+
+# Force checkpoint and wait for timeline history file to be archived
+$node_standby->safe_psql('postgres', "CHECKPOINT");
+
+# Verify 00000002.history is archived
+$node_standby->poll_query_until('postgres',
+	"SELECT size IS NOT NULL FROM pg_stat_file('pg_xlog/archive_status/00000002.history.done', true);")
+	or die "Timed out waiting for 00000002.history to be archived";
+
+ok(-f "$archive_dir/00000002.history",
+	'timeline 2 history file archived after first promotion');
+
+# Crash standby
+$node_standby->stop('immediate');
+
+# Rebuild standby from master using pg_basebackup
+my $standby_datadir = $node_standby->data_dir;
+File::Path::rmtree($standby_datadir);
+
+$ENV{PGOPTIONS} = '-c gp_session_role=utility';
+command_ok(['pg_basebackup',
+	'-D', $standby_datadir,
+	'-h', $node_master->host,
+	'-p', $node_master->port,
+	'--target-gp-dbid', '123',
+	'-x',
+	'--write-recovery-conf'],
+	'pg_basebackup standby from master');
+
+# Verify recovery.conf contains restore_command
+my $recovery_conf = "$standby_datadir/recovery.conf";
+ok(-f $recovery_conf, 'recovery.conf exists after pg_basebackup');
+
+my $recovery_conf_content = slurp_file($recovery_conf);
+like($recovery_conf_content, qr/restore_command\s*=/,
+	'recovery.conf contains restore_command');
+
+# Restore configuration
+$node_standby->append_conf('postgresql.conf', "hot_standby = off\n");
+$node_standby->append_conf('postgresql.conf', qq{
+archive_mode = on
+archive_command = '$archive_cmd'
+});
+
+# Start standby (now back on timeline 1)
+$node_standby->start;
+# NOTE(review): fixed delay, presumably because the mirror cannot be polled over SQL (see below) - confirm.
+sleep(2);
+
+# Kill master and promote standby again
+$node_master->stop('immediate');
+
+# Promote using pg_ctl directly since PostgresNode->promote() verification fails in mirror mode
+# (after pg_basebackup we have gp_contentid=0 and it affects connection acceptance).
+# Verify via logs.
+my $pgdata = $node_standby->data_dir;
+my $logfile = $node_standby->logfile;
+print "### Promoting node \"" . $node_standby->name . "\"\n";
+TestLib::system_or_bail('pg_ctl', '-D', $pgdata, '-l', $logfile, 'promote');
+
+# Wait for second promotion to complete
+my $max_wait = 60;
+my $waited = 0;
+my $promotions = 0;
+
+while ($waited < $max_wait) {
+	my $log_content = slurp_file($logfile);
+	my @promotion_matches = $log_content =~ /database system is ready to accept connections/g;
+	$promotions = scalar @promotion_matches;
+
+	last if $promotions >= 2;
+
+	sleep(1);
+	$waited++;
+}
+
+die "Second promotion did not complete in time (found $promotions ready messages)" if $promotions < 2;
+
+# Verify timeline 2 was selected only once and currently active timeline is 3
+my $log_content = slurp_file($logfile);
+my @timeline_2_matches = $log_content =~ /selected new timeline ID: 2/g;
+is(scalar @timeline_2_matches, 1, 'timeline 2 selected once');
+
+my @timeline_3_matches = $log_content =~ /selected new timeline ID: 3/g;
+is(scalar @timeline_3_matches, 1, 'timeline 3 selected once');
+