From 06108fd9743056e21416ff50c3479b9dc99181c2 Mon Sep 17 00:00:00 2001 From: Vadim Ponomarev Date: Sat, 13 Dec 2025 23:00:16 +0300 Subject: [PATCH] Fix WAL validation for multi-timeline recovery targets When validating a backup with --recovery-target-timeline different from the backup's timeline, pg_probackup incorrectly searched for WAL segments on the target timeline instead of the backup's timeline. Changes in C code: - validate_backup_wal_from_start_to_stop now always uses backup->tli for checking backup consistency (WAL from start_lsn to stop_lsn) - Added multi-timeline support in validate_wal: when target timeline differs from backup timeline, validation is performed in two phases: 1. From stop_lsn to switchpoint on backup's timeline 2. From switchpoint to target_lsn on target timeline - Pass target_tli to validate_wal in restore.c when specified Test fixes: - Remove @unittest.expectedFailure from test_validate_target_lsn - Add set_archiving for restored node to archive timeline 2 WAL - Fix switch_wal_segment call to use node_restored instead of node --- src/parsexlog.c | 69 ++++++++++++++++++++++++++++++++++++++---- src/restore.c | 3 +- tests/validate_test.py | 5 ++- 3 files changed, 67 insertions(+), 10 deletions(-) diff --git a/src/parsexlog.c b/src/parsexlog.c index 3dd591e5..94b36285 100644 --- a/src/parsexlog.c +++ b/src/parsexlog.c @@ -13,6 +13,7 @@ #include "pg_probackup.h" #include "access/transam.h" +#include "access/timeline.h" #include "catalog/pg_control.h" #include "commands/dbcommands_xlog.h" #include "catalog/storage_xlog.h" @@ -437,11 +438,11 @@ validate_wal(pgBackup *backup, const char *archivedir, join_path_components(backup_database_dir, backup->root_dir, DATABASE_DIR); join_path_components(backup_xlog_path, backup_database_dir, PG_XLOG_DIR); - validate_backup_wal_from_start_to_stop(backup, backup_xlog_path, tli, + validate_backup_wal_from_start_to_stop(backup, backup_xlog_path, backup->tli, wal_seg_size); } else - validate_backup_wal_from_start_to_stop(backup, (char *) archivedir, tli, + validate_backup_wal_from_start_to_stop(backup, (char *) archivedir, backup->tli, wal_seg_size); if (backup->status == BACKUP_STATUS_CORRUPT) @@ -486,10 +487,66 @@ validate_wal(pgBackup *backup, const char *archivedir, || (XRecOffIsValid(target_lsn) && last_rec.rec_lsn >= target_lsn)) all_wal = true; - all_wal = all_wal || - RunXLogThreads(archivedir, target_time, target_xid, target_lsn, - tli, wal_seg_size, backup->stop_lsn, - InvalidXLogRecPtr, true, validateXLogRecord, &last_rec, true); + /* + * If target timeline differs from backup timeline, we need to validate + * WAL across multiple timelines. Read WAL on backup's timeline up to + * the switchpoint, then continue on target timeline. + */ + if (backup->tli != tli) + { + parray *timelines; + XLogRecPtr switchpoint = InvalidXLogRecPtr; + int i; + + /* Read timeline history to find switchpoint */ + timelines = read_timeline_history(archivedir, tli, false); + if (timelines) + { + for (i = 0; i < parray_num(timelines); i++) + { + TimeLineHistoryEntry *entry = parray_get(timelines, i); + if (entry->tli == backup->tli) + { + switchpoint = entry->end; + break; + } + } + /* Cleanup timeline history */ + parray_walk(timelines, pg_free); + parray_free(timelines); + } + + if (switchpoint != InvalidXLogRecPtr && backup->stop_lsn < switchpoint) + { + /* First, validate WAL from stop_lsn to switchpoint on backup's timeline */ + all_wal = RunXLogThreads(archivedir, 0, InvalidTransactionId, + InvalidXLogRecPtr, backup->tli, wal_seg_size, + backup->stop_lsn, switchpoint, + true, validateXLogRecord, &last_rec, false); + if (all_wal) + { + /* Then validate from switchpoint to target on target timeline */ + all_wal = RunXLogThreads(archivedir, target_time, target_xid, target_lsn, + tli, wal_seg_size, switchpoint, + InvalidXLogRecPtr, true, validateXLogRecord, &last_rec, true); + } + } + else + { + /* switchpoint not found or stop_lsn >= switchpoint, use single timeline */ + all_wal = RunXLogThreads(archivedir, target_time, target_xid, target_lsn, + tli, wal_seg_size, backup->stop_lsn, + InvalidXLogRecPtr, true, validateXLogRecord, &last_rec, true); + } + } + else + { + /* Same timeline - simple case */ + all_wal = all_wal || + RunXLogThreads(archivedir, target_time, target_xid, target_lsn, + tli, wal_seg_size, backup->stop_lsn, + InvalidXLogRecPtr, true, validateXLogRecord, &last_rec, true); + } if (last_rec.rec_time > 0) time2iso(last_timestamp, lengthof(last_timestamp), timestamptz_to_time_t(last_rec.rec_time), false); diff --git a/src/restore.c b/src/restore.c index 0be151a9..bb88be92 100644 --- a/src/restore.c +++ b/src/restore.c @@ -625,7 +625,8 @@ do_restore_or_validate(InstanceState *instanceState, time_t target_backup_id, pg */ validate_wal(dest_backup, instanceState->instance_wal_subdir_path, rt->target_time, rt->target_xid, rt->target_lsn, - dest_backup->tli, instance_config.xlog_seg_size); + rt->target_tli ? rt->target_tli : dest_backup->tli, + instance_config.xlog_seg_size); } /* Orphanize every OK descendant of corrupted backup */ else diff --git a/tests/validate_test.py b/tests/validate_test.py index 9511810e..14f1a9f5 100644 --- a/tests/validate_test.py +++ b/tests/validate_test.py @@ -3434,8 +3434,6 @@ def test_validate_corrupt_tablespace_map(self): "\n Unexpected Error Message: {0}\n CMD: {1}".format(repr(e.message), self.cmd), ) - # TODO fix the test - @unittest.expectedFailure # @unittest.skip("skip") def test_validate_target_lsn(self): """ @@ -3468,11 +3466,12 @@ def test_validate_target_lsn(self): self.restore_node(backup_dir, "node", node_restored) + self.set_archiving(backup_dir, "node", node_restored) self.set_auto_conf(node_restored, {"port": node_restored.port}) node_restored.slow_start() - self.switch_wal_segment(node) + self.switch_wal_segment(node_restored) backup_id = self.backup_node(backup_dir, "node", node_restored, data_dir=node_restored.data_dir)