From 245784d7d15a316c51c965b6a91b8f3883ee4d28 Mon Sep 17 00:00:00 2001 From: Adrian Sutton Date: Mon, 18 May 2026 08:45:33 +1000 Subject: [PATCH 1/5] feat(op-supernode): rework interop activity startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the EL-finalized-head cold-start heuristic with a deterministic verifiedDB-resume / SafeDB-first-entry model. - Resume always wins. Any committed verifiedDB entry resumes at LastTimestamp+1 with no SafeDB or chain RPC consultation. - Cold start (no verifiedDB) waits for every chain to record a first SafeDB entry, then sets verificationStartTimestamp = max(activationTimestamp, max_c first-safe-head timestamp). Wall-clock time is never consulted; chain derivation progress is the only authoritative signal relative to activation. - Backfill lower bound is max(activation, per-chain genesis time, verificationStart - depth). Hard fails if any chain cannot serve the range. reconcileLogsDBTail runs only during cold-start backfill; warm-restart paths rely on DecisionRewind for drift handling. - Start splits into a fast init plus a stateful main loop. The loop drives both cold-start init and progressAndRecord, so Start never blocks on multi-day EL sync waits and per-iteration backoff / cancellation / observability come for free. - firstVerifiableTimestamp is now a synchronous accessor backed by verifiedDB.FirstTimestamp and verificationStartTimestamp; RPC handlers return ErrNotStarted while initialization is in progress. API surface: - SafeDBReader.FirstEntry on op-node, exposed through VirtualNode and ChainContainer (FirstSafeHeadTimestamp). ChainContainer reports ErrSafeDBEmpty during cold-start wait. Tests: - New startup_test.go covers fastInit resume / cold-start, the advanceColdStartInit state machine, the cold-start backfill no-op paths, the per-chain genesis clamp, and ErrNotStarted RPC semantics. 
- Deletes log_backfill_test.go and the obsolete EL-finalized-head startup tests in interop_test.go — replaced by the above. --- op-node/node/api.go | 3 + op-node/node/safedb/disabled.go | 5 + op-node/node/safedb/safedb.go | 24 + op-node/node/safedb/safedb_test.go | 58 + op-node/node/server_test.go | 9 + .../supernode/activity/interop/algo_test.go | 3 + .../supernode/activity/interop/interop.go | 196 ++-- .../activity/interop/interop_test.go | 409 +------- .../activity/interop/interop_test_access.go | 36 +- .../activity/interop/log_backfill.go | 186 ++-- .../activity/interop/log_backfill_test.go | 987 ------------------ .../supernode/activity/interop/logdb.go | 2 +- .../supernode/activity/interop/logdb_test.go | 6 +- .../activity/interop/startup_test.go | 355 +++++++ .../activity/supernode/supernode_test.go | 4 + .../activity/superroot/superroot_test.go | 3 + .../chain_container/chain_container.go | 25 + .../chain_container/chain_container_test.go | 8 + .../chain_container/invalidation_test.go | 3 + .../virtual_node/virtual_node.go | 17 + .../virtual_node/virtual_node_test.go | 16 + 21 files changed, 800 insertions(+), 1555 deletions(-) delete mode 100644 op-supernode/supernode/activity/interop/log_backfill_test.go create mode 100644 op-supernode/supernode/activity/interop/startup_test.go diff --git a/op-node/node/api.go b/op-node/node/api.go index 413896f443b..e9dffeca3f6 100644 --- a/op-node/node/api.go +++ b/op-node/node/api.go @@ -44,6 +44,9 @@ type driverClient interface { type SafeDBReader interface { SafeHeadAtL1(ctx context.Context, l1BlockNum uint64) (l1 eth.BlockID, l2 eth.BlockID, err error) + // FirstEntry returns the lowest recorded (L1, L2 safe head) pair. + // Returns ErrNotFound when no entries exist yet. 
+ FirstEntry(ctx context.Context) (l1 eth.BlockID, l2 eth.BlockID, err error) } type adminAPI struct { diff --git a/op-node/node/safedb/disabled.go b/op-node/node/safedb/disabled.go index 09ff5cf2433..2bcb41a362f 100644 --- a/op-node/node/safedb/disabled.go +++ b/op-node/node/safedb/disabled.go @@ -27,6 +27,11 @@ func (d *DisabledDB) SafeHeadAtL1(_ context.Context, _ uint64) (l1 eth.BlockID, return } +func (d *DisabledDB) FirstEntry(_ context.Context) (l1 eth.BlockID, safeHead eth.BlockID, err error) { + err = ErrNotEnabled + return +} + func (d *DisabledDB) SafeHeadReset(_ eth.L2BlockRef) error { return nil } diff --git a/op-node/node/safedb/safedb.go b/op-node/node/safedb/safedb.go index 0530a476a3b..6b902df4cce 100644 --- a/op-node/node/safedb/safedb.go +++ b/op-node/node/safedb/safedb.go @@ -171,6 +171,30 @@ func (d *SafeDB) SafeHeadReset(safeHead eth.L2BlockRef) error { } } +func (d *SafeDB) FirstEntry(ctx context.Context) (l1Block eth.BlockID, safeHead eth.BlockID, err error) { + d.m.RLock() + defer d.m.RUnlock() + if d.closed { + err = ErrClosed + return + } + iter, err := d.db.NewIterWithContext(ctx, safeByL1BlockNumKey.IterRange()) + if err != nil { + return + } + defer iter.Close() + if valid := iter.First(); !valid { + err = ErrNotFound + return + } + val, err := iter.ValueAndErr() + if err != nil { + return + } + l1Block, safeHead, err = decodeSafeByL1BlockNum(iter.Key(), val) + return +} + func (d *SafeDB) SafeHeadAtL1(ctx context.Context, l1BlockNum uint64) (l1Block eth.BlockID, safeHead eth.BlockID, err error) { d.m.RLock() defer d.m.RUnlock() diff --git a/op-node/node/safedb/safedb_test.go b/op-node/node/safedb/safedb_test.go index 03fae503c0f..b2b2c6d4b2a 100644 --- a/op-node/node/safedb/safedb_test.go +++ b/op-node/node/safedb/safedb_test.go @@ -83,6 +83,64 @@ func TestSafeHeadAtL1_EmptyDatabase(t *testing.T) { require.ErrorIs(t, err, ErrNotFound) } +func TestFirstEntry_EmptyDatabase(t *testing.T) { + logger := testlog.Logger(t, log.LvlInfo) + dir 
:= t.TempDir() + db, err := NewSafeDB(logger, dir) + require.NoError(t, err) + defer db.Close() + _, _, err = db.FirstEntry(context.Background()) + require.ErrorIs(t, err, ErrNotFound) +} + +func TestFirstEntry_ReturnsLowestL1(t *testing.T) { + logger := testlog.Logger(t, log.LvlInfo) + dir := t.TempDir() + db, err := NewSafeDB(logger, dir) + require.NoError(t, err) + defer db.Close() + + l2a := eth.L2BlockRef{Hash: common.Hash{0x02, 0xaa}, Number: 20} + l2b := eth.L2BlockRef{Hash: common.Hash{0x02, 0xbb}, Number: 25} + l1a := eth.BlockID{Hash: common.Hash{0x01, 0xaa}, Number: 100} + l1b := eth.BlockID{Hash: common.Hash{0x01, 0xbb}, Number: 150} + + // Insert out of order to confirm we return the lowest L1 block, not the + // first-inserted entry. + require.NoError(t, db.SafeHeadUpdated(l2b, l1b)) + require.NoError(t, db.SafeHeadUpdated(l2a, l1a)) + + actualL1, actualL2, err := db.FirstEntry(context.Background()) + require.NoError(t, err) + require.Equal(t, l1a, actualL1) + require.Equal(t, l2a.ID(), actualL2) +} + +func TestFirstEntry_StableAfterResetAhead(t *testing.T) { + logger := testlog.Logger(t, log.LvlInfo) + dir := t.TempDir() + db, err := NewSafeDB(logger, dir) + require.NoError(t, err) + defer db.Close() + + l1a := eth.BlockID{Hash: common.Hash{0x01, 0xaa}, Number: 100} + l1b := eth.BlockID{Hash: common.Hash{0x01, 0xbb}, Number: 150} + l2a := eth.L2BlockRef{Hash: common.Hash{0x02, 0xaa}, Number: 20, L1Origin: l1a} + l2b := eth.L2BlockRef{Hash: common.Hash{0x02, 0xbb}, Number: 25, L1Origin: l1b} + + require.NoError(t, db.SafeHeadUpdated(l2a, l1a)) + require.NoError(t, db.SafeHeadUpdated(l2b, l1b)) + + // Reset to l2b truncates entries at or after l2b; the l2a entry remains + // and must still be the first. 
+ require.NoError(t, db.SafeHeadReset(l2b)) + + actualL1, actualL2, err := db.FirstEntry(context.Background()) + require.NoError(t, err) + require.Equal(t, l1a, actualL1) + require.Equal(t, l2a.ID(), actualL2) +} + func TestTruncateOnSafeHeadReset(t *testing.T) { logger := testlog.Logger(t, log.LvlInfo) dir := t.TempDir() diff --git a/op-node/node/server_test.go b/op-node/node/server_test.go index ca15b02161b..8a755a38612 100644 --- a/op-node/node/server_test.go +++ b/op-node/node/server_test.go @@ -344,3 +344,12 @@ func (m *mockSafeDBReader) SafeHeadAtL1(ctx context.Context, l1BlockNum uint64) func (m *mockSafeDBReader) ExpectSafeHeadAtL1(l1BlockNum uint64, l1 eth.BlockID, safeHead eth.BlockID, err error) { m.Mock.On("SafeHeadAtL1", l1BlockNum).Return(l1, safeHead, &err) } + +func (m *mockSafeDBReader) FirstEntry(ctx context.Context) (l1 eth.BlockID, l2 eth.BlockID, err error) { + r := m.Mock.MethodCalled("FirstEntry") + return r[0].(eth.BlockID), r[1].(eth.BlockID), *r[2].(*error) +} + +func (m *mockSafeDBReader) ExpectFirstEntry(l1 eth.BlockID, safeHead eth.BlockID, err error) { + m.Mock.On("FirstEntry").Return(l1, safeHead, &err) +} diff --git a/op-supernode/supernode/activity/interop/algo_test.go b/op-supernode/supernode/activity/interop/algo_test.go index ee68d6c06ca..785c837f5e8 100644 --- a/op-supernode/supernode/activity/interop/algo_test.go +++ b/op-supernode/supernode/activity/interop/algo_test.go @@ -1073,6 +1073,9 @@ type algoMockChain struct { func (m *algoMockChain) BlockNumberToTimestamp(ctx context.Context, blocknum uint64) (uint64, error) { return 0, nil } +func (m *algoMockChain) FirstSafeHeadTimestamp(ctx context.Context) (uint64, error) { + return 0, cc.ErrSafeDBEmpty +} func (m *algoMockChain) ID() eth.ChainID { return m.id } func (m *algoMockChain) Start(ctx context.Context) error { return nil } func (m *algoMockChain) Stop(ctx context.Context) error { return nil } diff --git a/op-supernode/supernode/activity/interop/interop.go 
b/op-supernode/supernode/activity/interop/interop.go index 07dea21e8d7..3ada95e3981 100644 --- a/op-supernode/supernode/activity/interop/interop.go +++ b/op-supernode/supernode/activity/interop/interop.go @@ -143,13 +143,19 @@ type Interop struct { chains map[eth.ChainID]cc.InteropChain activationTimestamp uint64 // immutable protocol activation timestamp - // backfillEndTimestamp represents the end of the range of timestamps that were sealed by runLogBackfill. - // this is used for loop handoff from log backfill to main processing. - // firstVerifiableTimestamp is used to determine the start of the main processing loop, which is backfillEndTimestamp + 1 - // after backfill, or the EL-finalized-derived startup timestamp when backfill was not used. - backfillEndTimestamp uint64 - firstVerifiableSet bool - firstVerifiable uint64 + // verificationStartTimestamp is the first L2 timestamp the main loop + // attempts to verify. Set exactly once during fastInit (resume or + // future-activation paths) or by advanceColdStartInit, then immutable. + verificationStartTimestamp uint64 + + // initialized is set true once verificationStartTimestamp has been + // chosen. RPC accessors return ErrNotStarted while false. + initialized atomic.Bool + + // waitingForSync is true between fastInit deferring cold-start origin + // selection and the loop iteration that completes it. Only read/written + // by the main loop goroutine; no mutex needed. + waitingForSync bool dataDir string @@ -198,28 +204,21 @@ func (i *Interop) Name() string { return "interop" } -// firstVerifiableTimestamp is the earliest timestamp the main loop will attempt -// to verify. If verification has already committed results, the first committed -// timestamp is the durable handoff boundary. Otherwise it is backfillEndTimestamp+1 -// after log backfill, or — on cold start with no committed results and no -// backfill range — the EL-finalized-derived startup timestamp. 
-func (i *Interop) firstVerifiableTimestamp(ctx context.Context) (uint64, error) { +// firstVerifiableTimestamp is the earliest timestamp the verifier covers. +// If commits exist, the verifiedDB's first committed timestamp is the +// authoritative lower bound (it cannot move). Otherwise it is the chosen +// verificationStartTimestamp. Returns ErrNotStarted until initialization +// completes. +func (i *Interop) firstVerifiableTimestamp() (uint64, error) { if i.verifiedDB != nil { if first, initialized := i.verifiedDB.FirstTimestamp(); initialized { return first, nil } } - if i.backfillEndTimestamp != 0 { - next := i.backfillEndTimestamp + 1 - if next < i.activationTimestamp { - return i.activationTimestamp, nil - } - return next, nil + if !i.initialized.Load() { + return 0, ErrNotStarted } - if i.firstVerifiableSet { - return i.firstVerifiable, nil - } - return i.resolveFirstVerifiableTimestamp(ctx) + return i.verificationStartTimestamp, nil } // New constructs a new Interop activity. @@ -292,106 +291,79 @@ func (i *Interop) Start(ctx context.Context) error { i.started = true i.mu.Unlock() - if i.logBackfillDepth > 0 { - i.log.Info("interop log backfill depth configured", "duration", i.logBackfillDepth.String()) - for { - i.backfillAttempts.Add(1) - end, err := i.runLogBackfill() - if err == nil { - i.backfillEndTimestamp = end - break - } - i.log.Warn("log backfill failed, retrying (EL finalized head or chain data may not be ready yet)", "err", err) - for cid := range i.chains { - i.metrics.LogBackfillRetries.WithLabelValues(cid.String()).Inc() - } - select { - case <-i.ctx.Done(): - return fmt.Errorf("log backfill interrupted: %w", i.ctx.Err()) - case <-time.After(errorBackoffPeriod): - } - } - } - i.backfillCompleted.Store(true) - i.log.Info("log backfill complete", "backfillEndTimestamp", i.backfillEndTimestamp) + i.fastInit() + return i.runLoop() +} - firstVerifiableLog := uint64(0) - if i.backfillEndTimestamp != 0 { - firstVerifiableLog = 
i.backfillEndTimestamp + 1 - if firstVerifiableLog < i.activationTimestamp { - firstVerifiableLog = i.activationTimestamp - } - } else if lastTS, initialized := i.verifiedDB.LastTimestamp(); initialized { - // Resume from the last commit to keep verifiedDB gap-free. - firstVerifiableLog = lastTS + 1 - } else { - for { - first, err := i.readyFirstVerifiableTimestamp(i.ctx) - if err == nil { - i.firstVerifiable = first - i.firstVerifiableSet = true - firstVerifiableLog = first - break - } - // Permanent SafeDB gap must halt normal startup cleanly. Backfill-enabled - // startup reaches this path only if backfill had no range to seal. - if errors.Is(err, cc.ErrHistoryUnavailable) { - i.log.Error("interop activity halted: SafeDB history unavailable on this node", "err", err, - "remediation", "reseed data dir, advance interop.activation-timestamp past the gap, or rederive from L1") - return fmt.Errorf("interop halted due to unavailable history: %w", err) - } - i.log.Warn("first verifiable timestamp unavailable, retrying (EL finalized head or chain data may not be ready yet)", "err", err) - select { - case <-i.ctx.Done(): - return fmt.Errorf("first verifiable timestamp interrupted: %w", i.ctx.Err()) - case <-time.After(errorBackoffPeriod): - } - } +// fastInit selects verificationStartTimestamp from verifiedDB if any commit +// exists. Otherwise it defers to the cold-start loop, which waits for every +// chain to record a first SafeDB entry before picking an origin. Wall-clock +// time is not consulted: chain derivation progress is the only authoritative +// signal for "where we are" relative to activation. 
+func (i *Interop) fastInit() { + if lastTS, ok := i.verifiedDB.LastTimestamp(); ok { + i.verificationStartTimestamp = lastTS + 1 + i.initialized.Store(true) + i.log.Info("interop resuming from verifiedDB", + "verificationStartTimestamp", i.verificationStartTimestamp, + "activationTimestamp", i.activationTimestamp) + return } - i.log.Info("interop first verifiable timestamp resolved", - "activationTimestamp", i.activationTimestamp, - "firstVerifiableTimestamp", firstVerifiableLog) + i.waitingForSync = true + i.log.Info("interop cold start; waiting for SafeDB entries on every chain", + "activationTimestamp", i.activationTimestamp) +} +// runLoop drives initialization and verification. When waitingForSync is +// true the loop calls advanceColdStartInit each iteration until the cold +// start completes; otherwise it calls progressAndRecord. +func (i *Interop) runLoop() error { for { select { case <-i.ctx.Done(): return i.ctx.Err() default: - madeProgress, err := i.progressAndRecord() + } + + if i.waitingForSync { + advanced, err := i.advanceColdStartInit() if err != nil { - // Permanent SafeDB gap: log once and halt — retrying cannot fix it. 
- if errors.Is(err, cc.ErrHistoryUnavailable) { - i.metrics.ActivityErrors.WithLabelValues("interop", "history_unavailable").Inc() - i.log.Error("interop activity halted: SafeDB history unavailable on this node", "err", err, - "remediation", "reseed data dir, advance interop.activation-timestamp past the gap, or rederive from L1") - return fmt.Errorf("interop halted due to unavailable history: %w", err) + i.metrics.ActivityErrors.WithLabelValues("interop", "cold_start_init").Inc() + i.log.Error("interop cold start failed", "err", err) + return fmt.Errorf("interop cold start init: %w", err) + } + if !advanced { + select { + case <-i.ctx.Done(): + return i.ctx.Err() + case <-time.After(backoffPeriod): } - i.metrics.ActivityErrors.WithLabelValues("interop", "progress").Inc() - i.log.Error("failed to progress and record interop", "err", err) - time.Sleep(errorBackoffPeriod) continue } - if !madeProgress { - // Chains not ready, back off before next attempt - time.Sleep(backoffPeriod) - } - // Otherwise: immediately ready for next iteration (aggressive catch-up) + i.waitingForSync = false + i.initialized.Store(true) + i.log.Info("interop cold start complete", + "activationTimestamp", i.activationTimestamp, + "verificationStartTimestamp", i.verificationStartTimestamp) } - } -} -// readyFirstVerifiableTimestamp resolves the first timestamp that still needs -// interop verification and proves every chain can serve the optimistic L2/L1 -// data needed to verify it. 
-func (i *Interop) readyFirstVerifiableTimestamp(ctx context.Context) (uint64, error) { - first, err := i.resolveFirstVerifiableTimestamp(ctx) - if err != nil { - return 0, err - } - if _, err := i.checkChainsReady(first); err != nil { - return 0, err + madeProgress, err := i.progressAndRecord() + if err != nil { + if errors.Is(err, cc.ErrHistoryUnavailable) { + i.metrics.ActivityErrors.WithLabelValues("interop", "history_unavailable").Inc() + i.log.Error("interop activity halted: SafeDB history unavailable on this node", "err", err, + "remediation", "reseed data dir, advance interop.activation-timestamp past the gap, or rederive from L1") + return fmt.Errorf("interop halted due to unavailable history: %w", err) + } + i.metrics.ActivityErrors.WithLabelValues("interop", "progress").Inc() + i.log.Error("failed to progress and record interop", "err", err) + time.Sleep(errorBackoffPeriod) + continue + } + if !madeProgress { + time.Sleep(backoffPeriod) + } } - return first, nil } // Stop stops the Interop activity. 
@@ -536,7 +508,7 @@ func (i *Interop) observeRound() (RoundObservation, error) { obs.LastVerified = &result obs.NextTimestamp = lastTS + 1 } else { - next, err := i.firstVerifiableTimestamp(i.ctx) + next, err := i.firstVerifiableTimestamp() if err != nil { return obs, err } @@ -798,7 +770,7 @@ func (i *Interop) buildRewindPlan(lastTS uint64) (RewindPlan, error) { plan.ResetAllChainsTo = &resetTo } - first, err := i.firstVerifiableTimestamp(i.ctx) + first, err := i.firstVerifiableTimestamp() if err != nil { return RewindPlan{}, err } @@ -1004,7 +976,7 @@ func (i *Interop) VerifiedAtTimestamp(ts uint64) (bool, error) { if ts < i.activationTimestamp { return true, nil } - firstVerifiable, err := i.firstVerifiableTimestamp(i.ctx) + firstVerifiable, err := i.firstVerifiableTimestamp() if err != nil { return false, err } @@ -1039,7 +1011,7 @@ func (i *Interop) VerifiedResultAtTimestamp(ts uint64) (VerifiedResult, eth.Bloc if i.ctx == nil { return VerifiedResult{}, eth.BlockID{}, ErrNotStarted } - firstVerifiable, err := i.firstVerifiableTimestamp(i.ctx) + firstVerifiable, err := i.firstVerifiableTimestamp() if err != nil { return VerifiedResult{}, eth.BlockID{}, fmt.Errorf("resolve first verifiable: %w", err) } diff --git a/op-supernode/supernode/activity/interop/interop_test.go b/op-supernode/supernode/activity/interop/interop_test.go index 39d7f285ff3..53f9fcfc12f 100644 --- a/op-supernode/supernode/activity/interop/interop_test.go +++ b/op-supernode/supernode/activity/interop/interop_test.go @@ -103,6 +103,18 @@ func (h *interopTestHarness) Build() *interopTestHarness { if h.interop != nil { h.interop.l1Checker = noopL1Checker{} h.interop.ctx = context.Background() + // Tests in this harness exercise progressAndRecord / verify paths + // directly. Fake cold-start init completion so synchronous accessors + // don't return ErrNotStarted. 
With chains configured the harness + // matches the legacy default (one past the default safe time, = + // activation+1); with no chains the activation itself is the start. + // Dedicated cold-start-init tests bypass the harness. + if len(chains) > 0 { + h.interop.verificationStartTimestamp = h.activationTime + 1 + } else { + h.interop.verificationStartTimestamp = h.activationTime + } + h.interop.initialized.Store(true) h.t.Cleanup(func() { _ = h.interop.Stop(context.Background()) }) } return h @@ -185,346 +197,6 @@ func TestInteropActivationTimestampFlagEnvVar(t *testing.T) { require.Contains(t, InteropActivationTimestampFlag.GetEnvVars(), "OP_SUPERNODE_INTEROP_ACTIVATION_TIMESTAMP") } -/* -Spec: firstVerifiableTimestamp is the common interop startup readiness gate. - - - If verifiedDB is initialized, it returns the first committed timestamp. - - It returns an error while the EL finalized head is unavailable or unset. - - Once ready, it returns activation when there are no chains, or one past the - minimum EL finalized timestamp across chains otherwise. - - The no-backfill startup path uses that same timestamp for its first - verification attempt. -*/ -func TestFirstVerifiableTimestamp(t *testing.T) { - tests := []struct { - name string - setup func(*interopTestHarness) *interopTestHarness - want uint64 - wantErr bool - }{ - { - name: "no chains returns activation", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100).Build() - }, - want: 100, - }, - { - name: "EL finalized error blocks startup", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHeadErr = errors.New("EL finalized not ready") - }). - Build() - }, - wantErr: true, - }, - { - name: "empty EL finalized response blocks startup", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). 
- WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHeadSet = true - }). - Build() - }, - wantErr: true, - }, - { - name: "EL finalized at genesis with a real hash is accepted", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: 0, Hash: common.HexToHash("0xabc123")} - m.elFinalizedHeadSet = true - }). - Build() - }, - want: 100, - }, - { - name: "EL finalized before activation returns activation", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). - WithChain(10, func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 99, Time: 99}, - LocalSafeL2: eth.L2BlockRef{Number: 99, Time: 99}, - } - }). - Build() - }, - want: 100, - }, - { - name: "EL finalized at activation returns timestamp after activation", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). - WithChain(10, func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 100, Time: 100}, - LocalSafeL2: eth.L2BlockRef{Number: 100, Time: 100}, - } - }). - Build() - }, - want: 101, - }, - { - name: "returns timestamp after minimum EL finalized across chains", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). - WithChain(10, func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 140, Time: 140}, - LocalSafeL2: eth.L2BlockRef{Number: 140, Time: 140}, - } - }). - WithChain(20, func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - LocalSafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - } - }). - Build() - }, - want: 126, - }, - { - name: "uses EL finalized when sync status safe is stale", - setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). 
- WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: 125, Time: 125} - m.elFinalizedHeadSet = true - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 0, Time: 100}, - LocalSafeL2: eth.L2BlockRef{Number: 200, Time: 200}, - } - }). - Build() - }, - want: 126, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - h := newInteropTestHarness(t) - tc.setup(h) - - got, err := h.interop.firstVerifiableTimestamp(context.Background()) - if tc.wantErr { - require.Error(t, err) - return - } - require.NoError(t, err) - require.Equal(t, tc.want, got) - }) - } -} - -func TestReadyFirstVerifiableTimestamp(t *testing.T) { - tests := []struct { - name string - configure func(*mockChainContainer) - want uint64 - wantErr error - }{ - { - name: "probes optimistic data at selected timestamp", - configure: func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - LocalSafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - } - }, - want: 126, - }, - { - name: "optimistic not found leaves startup unresolved", - configure: func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - LocalSafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - } - m.optimisticAtErr = ethereum.NotFound - }, - wantErr: ethereum.NotFound, - }, - { - name: "history unavailable surfaces permanent gap", - configure: func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - LocalSafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - } - m.optimisticAtErr = cc.ErrHistoryUnavailable - }, - wantErr: cc.ErrHistoryUnavailable, - }, - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - h := newInteropTestHarness(t). - WithActivation(100). - WithChain(10, tc.configure). 
- Build() - - got, err := h.interop.readyFirstVerifiableTimestamp(context.Background()) - if tc.wantErr != nil { - require.ErrorIs(t, err, tc.wantErr) - return - } - require.NoError(t, err) - require.Equal(t, tc.want, got) - require.Equal(t, int32(1), h.Mock(10).callsCompleted.Load()) - require.Equal(t, tc.want, h.Mock(10).lastRequestedTimestamp) - }) - } -} - -func TestStartWithoutBackfillUsesFirstVerifiableTimestamp(t *testing.T) { - const activation = uint64(100) - const safe = uint64(125) - - h := newInteropTestHarness(t). - WithActivation(activation). - WithChain(10, func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: safe, Time: safe}, - LocalSafeL2: eth.L2BlockRef{Number: safe, Time: safe}, - } - }). - Build() - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - verifiedTS := make(chan uint64, 1) - h.interop.verifyFn = func(ts uint64, blocks map[eth.ChainID]eth.BlockID, _ map[eth.ChainID]eth.BlockID, _ *frontierVerificationView) (Result, error) { - verifiedTS <- ts - cancel() - return Result{}, nil - } - - done := make(chan error, 1) - go func() { done <- h.interop.Start(ctx) }() - - select { - case ts := <-verifiedTS: - require.Equal(t, safe+1, ts) - case <-time.After(5 * time.Second): - t.Fatal("interop did not attempt verification") - } - - require.ErrorIs(t, <-done, context.Canceled) -} - -// Warm restart with no backfill must resume from verifiedDB without consulting -// EL finalized. -func TestStartWithoutBackfillResumesFromVerifiedDBIgnoringELFinalized(t *testing.T) { - const ( - activation uint64 = 100 - lastVerified uint64 = 195 - ) - - var elFinalizedCalls atomic.Int32 - h := newInteropTestHarness(t). - WithActivation(activation). 
- WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHeadOverride = func() (eth.L2BlockRef, error) { - elFinalizedCalls.Add(1) - return eth.L2BlockRef{Number: 190, Time: 190}, nil - } - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 200, Time: 200}, - LocalSafeL2: eth.L2BlockRef{Number: 200, Time: 200}, - } - }). - Build() - - chain10 := h.Mock(10) - for ts := activation + 1; ts <= lastVerified; ts++ { - require.NoError(t, h.interop.verifiedDB.Commit(VerifiedResult{ - Timestamp: ts, - L1Inclusion: eth.BlockID{Number: 1, Hash: common.HexToHash("0xL1")}, - L2Heads: map[eth.ChainID]eth.BlockID{chain10.id: {Number: ts, Hash: common.BigToHash(new(big.Int).SetUint64(ts))}}, - })) - } - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - verifiedTS := make(chan uint64, 1) - h.interop.verifyFn = func(ts uint64, blocks map[eth.ChainID]eth.BlockID, _ map[eth.ChainID]eth.BlockID, _ *frontierVerificationView) (Result, error) { - verifiedTS <- ts - cancel() - return Result{}, nil - } - - done := make(chan error, 1) - go func() { done <- h.interop.Start(ctx) }() - - select { - case ts := <-verifiedTS: - require.Equal(t, lastVerified+1, ts, - "startup must resume verification at verifiedDB.LastTimestamp+1, not gate on EL finalized") - case <-time.After(5 * time.Second): - t.Fatal("interop did not start verifying despite initialized verifiedDB") - } - - require.ErrorIs(t, <-done, context.Canceled) - require.Zero(t, elFinalizedCalls.Load(), - "warm-restart startup must not consult EL finalized when verifiedDB is initialized") -} - -func TestStartWithBackfillRunsBeforeSafeDBReadyCheck(t *testing.T) { - const activation = uint64(100) - const safe = uint64(125) - - h := newInteropTestHarness(t). - WithActivation(activation). - WithLogBackfillDepth(5*time.Second). 
- WithChain(10, func(m *mockChainContainer) { - m.currentL1 = eth.BlockRef{Number: 100, Hash: common.HexToHash("0x1")} - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: safe, Time: safe}, - LocalSafeL2: eth.L2BlockRef{Number: safe, Time: safe}, - } - m.optimisticAtErr = cc.ErrHistoryUnavailable - }). - Build() - - done := make(chan error, 1) - go func() { done <- h.interop.Start(context.Background()) }() - - var err error - select { - case err = <-done: - case <-time.After(5 * time.Second): - t.Fatal("interop did not halt after post-backfill SafeDB readiness failure") - } - require.ErrorIs(t, err, cc.ErrHistoryUnavailable) - require.Equal(t, int32(1), h.interop.BackfillAttempts()) - require.Equal(t, safe, h.interop.BackfillEndTimestamp()) - - first, err := h.interop.FirstSealedBlock(eth.ChainIDFromUInt64(10)) - require.NoError(t, err) - // The first real backfilled block is 120, and the logs DB records its - // virtual parent as the first sealed block. - require.Equal(t, uint64(119), first.Number) - require.Equal(t, uint64(120), first.Timestamp) - - latest, ok, err := h.interop.LatestSealedBlock(eth.ChainIDFromUInt64(10)) - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, safe, latest.Number) - require.Equal(t, safe, latest.Timestamp) -} - // ============================================================================= // TestStartStop // ============================================================================= @@ -633,11 +305,16 @@ func TestStartStop(t *testing.T) { }, { // Permanent SafeDB gap must halt Start cleanly (no retry loop). + // Cold-start init completes (first safe head timestamp available), + // then progressAndRecord sees ErrHistoryUnavailable from + // OptimisticAt and halts immediately. 
name: "Start halts when a chain reports ErrHistoryUnavailable", setup: func(h *interopTestHarness) *interopTestHarness { return h.WithChain(10, func(m *mockChainContainer) { m.currentL1 = eth.BlockRef{Number: 100, Hash: common.HexToHash("0x1")} m.blockAtTimestamp = eth.L2BlockRef{Number: 50} + m.firstSafeHeadTimestamp = h.activationTime + m.firstSafeHeadTimestampSet = true m.optimisticAtErr = cc.ErrHistoryUnavailable }).Build() }, @@ -1138,14 +815,11 @@ func TestVerifiedAtTimestamp(t *testing.T) { { name: "startup handoff timestamps are verified", setup: func(h *interopTestHarness) *interopTestHarness { - return h.WithActivation(100). - WithChain(10, func(m *mockChainContainer) { - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - LocalSafeL2: eth.L2BlockRef{Number: 125, Time: 125}, - } - }). + h.WithActivation(100). + WithChain(10, func(m *mockChainContainer) {}). Build() + h.interop.verificationStartTimestamp = 126 + return h }, run: func(t *testing.T, h *interopTestHarness) { verified, err := h.interop.VerifiedAtTimestamp(125) @@ -1221,15 +895,11 @@ func TestVerifiedResultAtTimestamp(t *testing.T) { t.Run("post-activation but below firstVerifiable returns ErrBeforeVerifiedDB", func(t *testing.T) { h := newInteropTestHarness(t). WithActivation(100). - WithChain(10, func(m *mockChainContainer) { - // EL finalized time 500 -> firstVerifiable=501. ts=200 is post - // activation but below firstVerifiable on this node. - m.syncStatusFull = ð.SyncStatus{ - SafeL2: eth.L2BlockRef{Number: 500, Time: 500}, - LocalSafeL2: eth.L2BlockRef{Number: 500, Time: 500}, - } - }). + WithChain(10, func(m *mockChainContainer) {}). Build() + // Cold start would have picked verificationStart >= 501 once the first + // SafeDB entry was at L2 time 500; ts=200 is post-activation but below. 
+ h.interop.verificationStartTimestamp = 501 _, _, err := h.interop.VerifiedResultAtTimestamp(200) require.ErrorIs(t, err, ErrBeforeVerifiedDB) require.NotErrorIs(t, err, ErrNotActive) @@ -1315,14 +985,15 @@ func TestFirstVerifiableTimestampRestoresSafeHeadHandoffAfterRestart(t *testing. require.NotNil(t, interop) defer func() { require.NoError(t, interop.Stop(context.Background())) }() - first, err := interop.firstVerifiableTimestamp(context.Background()) + first, err := interop.firstVerifiableTimestamp() require.NoError(t, err) require.Equal(t, uint64(126), first) - interop.backfillEndTimestamp = 200 - first, err = interop.firstVerifiableTimestamp(context.Background()) + interop.verificationStartTimestamp = 201 + interop.initialized.Store(true) + first, err = interop.firstVerifiableTimestamp() require.NoError(t, err) - require.Equal(t, uint64(126), first, "persisted verifier lower bound takes precedence over restart backfill") + require.Equal(t, uint64(126), first, "persisted verifier lower bound takes precedence over restart resume point") verified, err := interop.VerifiedAtTimestamp(125) require.NoError(t, err) @@ -1731,6 +1402,8 @@ func TestInterop_FullCycle(t *testing.T) { require.NotNil(t, interop) interop.l1Checker = noopL1Checker{} interop.ctx = context.Background() + interop.verificationStartTimestamp = 101 + interop.initialized.Store(true) // Verify logsDB is empty initially _, hasBlocks := interop.logsDBs[mock.id].LatestSealedBlock() @@ -2077,6 +1750,13 @@ type mockChainContainer struct { // time (and per-block timestamps) through BlockNumberToTimestamp. Used // by tests that exercise the genesis-clamp path in runLogBackfill. blockNumberToTimestampOverride func(ctx context.Context, blocknum uint64) (uint64, error) + + // firstSafeHeadTimestamp lets tests stub FirstSafeHeadTimestamp. + // firstSafeHeadTimestampErr defaults to chain_container.ErrSafeDBEmpty + // when neither field is set so the cold-start init loop keeps waiting. 
+ firstSafeHeadTimestamp uint64 + firstSafeHeadTimestampSet bool + firstSafeHeadTimestampErr error } type invalidateBlockCall struct { @@ -2110,6 +1790,17 @@ func (m *mockChainContainer) BlockNumberToTimestamp(ctx context.Context, blocknu } return 0, nil } +func (m *mockChainContainer) FirstSafeHeadTimestamp(ctx context.Context) (uint64, error) { + m.mu.Lock() + defer m.mu.Unlock() + if m.firstSafeHeadTimestampErr != nil { + return 0, m.firstSafeHeadTimestampErr + } + if m.firstSafeHeadTimestampSet { + return m.firstSafeHeadTimestamp, nil + } + return 0, cc.ErrSafeDBEmpty +} func (m *mockChainContainer) ELFinalizedHead(ctx context.Context) (eth.L2BlockRef, error) { m.mu.Lock() defer m.mu.Unlock() diff --git a/op-supernode/supernode/activity/interop/interop_test_access.go b/op-supernode/supernode/activity/interop/interop_test_access.go index 12988a10d60..edcd209f53a 100644 --- a/op-supernode/supernode/activity/interop/interop_test_access.go +++ b/op-supernode/supernode/activity/interop/interop_test_access.go @@ -6,7 +6,6 @@ package interop // in this file so the boundary is easy to audit. import ( - "context" "fmt" "github.com/ethereum-optimism/optimism/op-service/eth" @@ -37,19 +36,19 @@ func (i *Interop) Resume() { // Backfill observability // --------------------------------------------------------------------------- -// BackfillAttempts returns the number of times runLogBackfill has been +// BackfillAttempts returns the number of times advanceColdStartInit has been // invoked since the most recent Start. Integration tests use it to confirm -// the retry loop has engaged. +// the cold-start retry loop has engaged. func (i *Interop) BackfillAttempts() int32 { return i.backfillAttempts.Load() } -// BackfillCompleted reports whether the log backfill phase has finished -// (either ran and returned nil, or was skipped because logBackfillDepth -// was 0). Integration tests use it to gate assertions on downstream state -// until backfill is done. 
+// BackfillCompleted reports whether cold-start initialization has finished +// successfully (or was never needed because resume or future-activation +// initialization fired). Integration tests gate assertions on downstream +// state until this is true. func (i *Interop) BackfillCompleted() bool { - return i.backfillCompleted.Load() + return i.backfillCompleted.Load() || (i.initialized.Load() && !i.waitingForSync) } // --------------------------------------------------------------------------- @@ -63,18 +62,21 @@ func (i *Interop) ActivationTimestamp() uint64 { return i.activationTimestamp } -// BackfillEndTimestamp returns the inclusive last timestamp whose logs were -// sealed by runLogBackfill, or 0 if backfill has not run. The main loop -// starts verification at BackfillEndTimestamp()+1 (or ActivationTimestamp() -// when backfill was skipped). -func (i *Interop) BackfillEndTimestamp() uint64 { - return i.backfillEndTimestamp +// VerificationStartTimestamp returns the L2 timestamp at which the main loop +// begins verification on the most recent Start. Returns 0 before +// initialization completes. +func (i *Interop) VerificationStartTimestamp() uint64 { + if !i.initialized.Load() { + return 0 + } + return i.verificationStartTimestamp } -// FirstVerifiableTimestamp returns the timestamp at which the main loop begins -// verification. It is intended for tests after startup has completed. +// FirstVerifiableTimestamp returns the lowest timestamp the verifier covers +// (verifiedDB.FirstTimestamp when commits exist, else +// VerificationStartTimestamp). Returns 0 before initialization completes. 
func (i *Interop) FirstVerifiableTimestamp() uint64 { - ts, err := i.firstVerifiableTimestamp(context.Background()) + ts, err := i.firstVerifiableTimestamp() if err != nil { return 0 } diff --git a/op-supernode/supernode/activity/interop/log_backfill.go b/op-supernode/supernode/activity/interop/log_backfill.go index d76abcf253b..c44f9755e6e 100644 --- a/op-supernode/supernode/activity/interop/log_backfill.go +++ b/op-supernode/supernode/activity/interop/log_backfill.go @@ -2,114 +2,147 @@ package interop import ( "context" + "errors" "fmt" - "math" "sync" "github.com/ethereum-optimism/optimism/op-service/eth" cc "github.com/ethereum-optimism/optimism/op-supernode/supernode/chain_container" ) -// resolveFirstVerifiableTimestamp returns the first timestamp not yet covered -// by durable local state: verifiedDB.LastTimestamp+1 when initialized, -// otherwise the minimum EL finalized head + 1 (clamped to activation). -func (i *Interop) resolveFirstVerifiableTimestamp(ctx context.Context) (uint64, error) { - if len(i.chains) == 0 { - return i.activationTimestamp, nil - } - if i.verifiedDB != nil { - if lastTS, initialized := i.verifiedDB.LastTimestamp(); initialized { - return lastTS + 1, nil - } - } - minELFinalizedTime, err := i.minELFinalizedTime(ctx) +// advanceColdStartInit runs one best-effort pass at cold-start initialization: +// it collects every chain's first SafeDB entry timestamp, picks +// verificationStartTimestamp = max(activation, max_c T_c), runs backfill, and +// signals advance=true on success. Returns advance=false when any chain's +// SafeDB is still empty (caller backs off and retries). Errors from the +// backfill phase are fatal. 
+func (i *Interop) advanceColdStartInit() (bool, error) { + i.backfillAttempts.Add(1) + + perChainTS, ready, err := i.collectFirstSafeHeadTimestamps() if err != nil { - return 0, err + return false, err } - if minELFinalizedTime < i.activationTimestamp { - return i.activationTimestamp, nil + if !ready { + return false, nil } - return minELFinalizedTime + 1, nil -} -func (i *Interop) minELFinalizedTime(ctx context.Context) (uint64, error) { - if len(i.chains) == 0 { - return i.activationTimestamp, nil + verificationStart := i.activationTimestamp + for _, ts := range perChainTS { + if ts > verificationStart { + verificationStart = ts + } } + i.verificationStartTimestamp = verificationStart + // Flip initialized before backfill: backfill seals into logsDB, and + // sealBlockDataIntoLogsDB queries firstVerifiableTimestamp to validate + // the timestamp gap. That accessor returns ErrNotStarted while + // initialized is false. + i.initialized.Store(true) - minELFinalizedTime := uint64(math.MaxUint64) + if err := i.runColdStartBackfill(verificationStart); err != nil { + return false, fmt.Errorf("backfill: %w", err) + } + i.backfillCompleted.Store(true) + return true, nil +} + +// collectFirstSafeHeadTimestamps queries every chain's SafeDB for its first +// entry timestamp in parallel. Returns ready=false (without error) if any +// chain has no entries yet; the caller backs off and retries. Other errors +// are reported as-is. +func (i *Interop) collectFirstSafeHeadTimestamps() (map[eth.ChainID]uint64, bool, error) { + type res struct { + id eth.ChainID + ts uint64 + err error + } + results := make(chan res, len(i.chains)) for _, chain := range i.chains { - elFinalized, err := chain.ELFinalizedHead(ctx) - if err != nil { - return 0, fmt.Errorf("chain %s: EL finalized head: %w", chain.ID(), err) - } - // Genesis (Number == 0) with a real hash is a legitimate finalized head; - // only reject the zero-value response from an EL that isn't ready yet. 
- if elFinalized == (eth.L2BlockRef{}) { - return 0, fmt.Errorf("chain %s: EL finalized head not yet available", chain.ID()) + go func(c cc.ChainContainer) { + ts, err := c.FirstSafeHeadTimestamp(i.ctx) + results <- res{id: c.ID(), ts: ts, err: err} + }(chain) + } + out := make(map[eth.ChainID]uint64, len(i.chains)) + var firstErr error + emptyAny := false + for range i.chains { + r := <-results + if r.err != nil { + if errors.Is(r.err, cc.ErrSafeDBEmpty) { + emptyAny = true + i.log.Debug("interop cold start: chain SafeDB empty, waiting", "chain", r.id) + continue + } + if firstErr == nil { + firstErr = fmt.Errorf("chain %s: first safe head timestamp: %w", r.id, r.err) + } + continue } - i.log.Debug("first verifiable timestamp: EL finalized head", - "chain", chain.ID(), "elFinalized", elFinalized) - minELFinalizedTime = min(minELFinalizedTime, elFinalized.Time) + out[r.id] = r.ts + } + if firstErr != nil { + return nil, false, firstErr } - return minELFinalizedTime, nil + if emptyAny { + return nil, false, nil + } + return out, true, nil } -func (i *Interop) runLogBackfill() (uint64, error) { +// runColdStartBackfill seals logs over the configured backfill window leading +// up to verificationStart. The per-chain lower bound is +// max(activationTimestamp, perChainGenesisTime, verificationStart - depth). +// Returns nil if logBackfillDepth is zero or no chains are configured. 
+func (i *Interop) runColdStartBackfill(verificationStart uint64) error { if i.logBackfillDepth <= 0 { - return 0, nil + return nil } if len(i.chains) == 0 { - return 0, nil - } - - firstVerifiable := i.firstVerifiable - if !i.firstVerifiableSet { - var err error - firstVerifiable, err = i.resolveFirstVerifiableTimestamp(i.ctx) - if err != nil { - return 0, err - } + return nil } - if firstVerifiable == i.activationTimestamp { - return 0, nil + if verificationStart == 0 { + return fmt.Errorf("invalid verificationStartTimestamp 0 for backfill") } - endTime := firstVerifiable - 1 + endTime := verificationStart - 1 - // naively, end minus depth is the ideal backfill start. - // guard the subtraction so a young chain (EL finalized < depth) doesn't wrap. depthSec := uint64(i.logBackfillDepth.Seconds()) - var idealStart uint64 + var depthFloor uint64 if endTime >= depthSec { - idealStart = endTime - depthSec + depthFloor = endTime - depthSec + } + commonStart := max(depthFloor, i.activationTimestamp) + if commonStart > endTime { + return nil } - // clamp to the activation timestamp: never backfill before activation. 
- startTime := max(idealStart, i.activationTimestamp) - // backfill every chain in parallel over [startTime, endTime] errCh := make(chan error, len(i.chains)) wg := sync.WaitGroup{} wg.Add(len(i.chains)) for _, chain := range i.chains { go func(chain cc.ChainContainer) { defer wg.Done() - chainStartTime := startTime - // if we can identify the genesis time, use it to clamp the start time - // if we can't, we'd either fail now or later when trying to use the value - if genesisTime, err := chain.BlockNumberToTimestamp(i.ctx, 0); err == nil && - genesisTime > startTime { - chainStartTime = genesisTime + chainStart := commonStart + genesisTime, err := chain.BlockNumberToTimestamp(i.ctx, 0) + if err != nil { + errCh <- fmt.Errorf("chain %s: genesis timestamp: %w", chain.ID(), err) + return + } + if genesisTime > chainStart { + chainStart = genesisTime + } + if chainStart > endTime { + return } - startNum, err := chain.TimestampToBlockNumber(i.ctx, chainStartTime) + startNum, err := chain.TimestampToBlockNumber(i.ctx, chainStart) if err != nil { - errCh <- fmt.Errorf("chain %s: timestamp to block number for start %d: %w", chain.ID(), startTime, err) - i.log.Error("log backfill: timestamp to block number for start", "chain", chain.ID(), "err", err) + errCh <- fmt.Errorf("chain %s: timestamp to block number for start %d: %w", chain.ID(), chainStart, err) return } endNum, err := chain.TimestampToBlockNumber(i.ctx, endTime) if err != nil { errCh <- fmt.Errorf("chain %s: timestamp to block number for end %d: %w", chain.ID(), endTime, err) - i.log.Error("log backfill: timestamp to block number for end", "chain", chain.ID(), "err", err) return } i.log.Info("log backfill: sealing logs", @@ -123,25 +156,26 @@ func (i *Interop) runLogBackfill() (uint64, error) { wg.Wait() close(errCh) for err := range errCh { - return 0, err + return err } - return endTime, nil + return nil } +// backfillChain seals every canonical block in [startNum, endNum] into the +// chain's logsDB. 
Calls reconcileLogsDBTail first to drop any tail that +// diverged from canonical while the supernode was offline (only meaningful +// during cold-start backfill: a verifiedDB-resume path never enters here). func (i *Interop) backfillChain(ctx context.Context, cid eth.ChainID, chain cc.ChainContainer, startNum, endNum uint64) error { db := i.logsDBs[cid] - // This is a startup best-effort repair for pre-existing logsDB reorg drift, - // separate from the normal interop observation/apply loop. It does not close - // the window where an L2 reorg lands after reconciliation/backfill and before - // normal interop persists its first frontier block. In that case the write path - // fails with ErrParentHashMismatch or ErrStaleLogsDB instead of appending - // inconsistent logs. if err := i.reconcileLogsDBTail(ctx, cid, chain, db); err != nil { return err } if latest, has := db.LatestSealedBlock(); has { startNum = latest.Number + 1 } + if startNum > endNum { + return nil + } totalBlocks := endNum - startNum + 1 for num := startNum; num <= endNum; num++ { out, err := chain.OutputV0AtBlockNumber(ctx, num) @@ -188,8 +222,6 @@ func (i *Interop) reconcileLogsDBTail(ctx context.Context, cid eth.ChainID, chai return fmt.Errorf("chain %s: first sealed block during reconcile: %w", cid, err) } - // Walk back from latest.Number-1 looking for the deepest sealed block whose - // hash still matches canonical. latest itself is already known to diverge. 
for n := latest.Number; n > first.Number; { n-- seal, err := db.FindSealedBlock(n) diff --git a/op-supernode/supernode/activity/interop/log_backfill_test.go b/op-supernode/supernode/activity/interop/log_backfill_test.go deleted file mode 100644 index 71cd4ca3912..00000000000 --- a/op-supernode/supernode/activity/interop/log_backfill_test.go +++ /dev/null @@ -1,987 +0,0 @@ -package interop - -import ( - "context" - "errors" - "math/big" - "sync/atomic" - "testing" - "time" - - "github.com/ethereum-optimism/optimism/op-service/eth" - "github.com/ethereum/go-ethereum/common" - "github.com/stretchr/testify/require" -) - -// progressInteropUntil calls progressAndRecord up to maxIters times until cond() is true. -func progressInteropUntil(t *testing.T, i *Interop, maxIters int, cond func() bool) { - t.Helper() - for range maxIters { - if cond() { - return - } - _, err := i.progressAndRecord() - require.NoError(t, err) - } -} - -func requireFirstVerifiableTimestamp(t *testing.T, i *Interop, want uint64, msgAndArgs ...interface{}) { - t.Helper() - got, err := i.firstVerifiableTimestamp(context.Background()) - require.NoError(t, err) - require.Equal(t, want, got, msgAndArgs...) -} - -func TestLogBackfill_ResumesAfterInterruption(t *testing.T) { - const act = uint64(100) - depth := 10 * time.Second // EL finalized 110, depth 10s -> T_lo 100; should seal 100..110 - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - m.currentL1 = eth.BlockRef{Number: 1, Hash: common.HexToHash("0xL1")} - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - SafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - LocalSafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - } - }). - Build() - h.interop.ctx = context.Background() - - // Simulate a previous partial run: seal blocks 100..105 into the logsDB.
- chain10 := h.Mock(10) - for num := uint64(100); num <= 105; num++ { - out, err := chain10.OutputV0AtBlockNumber(context.Background(), num) - require.NoError(t, err) - bid := eth.BlockID{Hash: out.BlockHash, Number: num} - blockInfo, receipts, err := chain10.FetchReceipts(context.Background(), bid) - require.NoError(t, err) - err = h.interop.sealBlockDataIntoLogsDB(chain10.id, bid, blockInfo, receipts, blockInfo.Time(), true) - require.NoError(t, err) - } - - latest, has := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.True(t, has) - require.Equal(t, uint64(105), latest.Number) - - // Track how many OutputV0 calls happen during backfill to confirm we - // don't re-fetch blocks 100..105. - var fetchCount atomic.Int32 - chain10.outputV0Override = func(ctx context.Context, num uint64) (*eth.OutputV0, error) { - fetchCount.Add(1) - return &eth.OutputV0{ - StateRoot: eth.Bytes32(common.HexToHash("0xmockstate")), - MessagePasserStorageRoot: eth.Bytes32(common.HexToHash("0xmockmsg")), - BlockHash: common.BigToHash(new(big.Int).SetUint64(num)), - }, nil - } - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - h.interop.backfillEndTimestamp = end - require.Equal(t, uint64(110), end, - "runLogBackfill must return minELFinalizedTime as the end of the sealed range") - requireFirstVerifiableTimestamp(t, h.interop, 111, - "main loop resumes at backfillEndTimestamp+1") - require.Equal(t, act, h.interop.activationTimestamp, "protocol activation must not change") - - latest, has = h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.True(t, has) - require.Equal(t, uint64(110), latest.Number) - - // 5 fetches for blocks 106..110 + 1 reconcile probe at block 105. - require.Equal(t, int32(6), fetchCount.Load()) -} - -func TestLogBackfill_RetriesWhenELFinalizedNotReady(t *testing.T) { - const act = uint64(100) - depth := 10 * time.Second - - // Track EL finalized head call count so we can make the first N calls fail.
- var elFinalizedCalls atomic.Int32 - failUntil := int32(3) // first 3 calls return error, then succeed - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - m.currentL1 = eth.BlockRef{Number: 1, Hash: common.HexToHash("0xL1")} - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - SafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - LocalSafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - } - m.elFinalizedHeadOverride = func() (eth.L2BlockRef, error) { - n := elFinalizedCalls.Add(1) - if n <= failUntil { - return eth.L2BlockRef{}, errors.New("EL finalized not ready") - } - return eth.L2BlockRef{Number: 110, Time: 110}, nil - } - }). - Build() - - // Use a shorter backoff for tests. - origBackoff := errorBackoffPeriod - errorBackoffPeriod = 10 * time.Millisecond - t.Cleanup(func() { errorBackoffPeriod = origBackoff }) - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - done := make(chan error, 1) - go func() { done <- h.interop.Start(ctx) }() - - // Wait for backfill to complete: backfillEndTimestamp should be set - // to the end of the sealed range (110).
- require.Eventually(t, func() bool { - return h.interop.backfillEndTimestamp > 0 - }, 5*time.Second, 20*time.Millisecond, "backfill should eventually succeed after retries") - - require.GreaterOrEqual(t, elFinalizedCalls.Load(), failUntil, - "EL finalized head should have been called at least %d times (the failing ones)", failUntil) - require.Equal(t, uint64(110), h.interop.backfillEndTimestamp) - requireFirstVerifiableTimestamp(t, h.interop, 111) - require.Equal(t, act, h.interop.activationTimestamp, "protocol activation must not change") - - cancel() - <-done -} - -// TestLogBackfill_RecoversFromOfflineReorg tests an L2 reorg that -// invalidates a sealed block while supernode is offline self-heals on -// restart, not loop forever on ErrParentHashMismatch. -func TestLogBackfill_RecoversFromOfflineReorg(t *testing.T) { - const act = uint64(100) - depth := 20 * time.Second - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - m.currentL1 = eth.BlockRef{Number: 1, Hash: common.HexToHash("0xL1")} - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - SafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - LocalSafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - } - }). - Build() - - chain10 := h.Mock(10) - db := h.interop.logsDBs[chain10.id] - - // Pre-seed blocks 100..105 with a stale "v1" fork hash; the mock's canonical - // view ("v2") returns BigToHash(n).
- v1Hash := func(n uint64) common.Hash { - return common.BigToHash(new(big.Int).SetUint64(n | 0xdead0000)) - } - require.NoError(t, db.SealBlock(common.Hash{}, - eth.BlockID{Number: 100, Hash: v1Hash(100)}, 100)) - for n := uint64(101); n <= 105; n++ { - require.NoError(t, db.SealBlock(v1Hash(n-1), - eth.BlockID{Number: n, Hash: v1Hash(n)}, n)) - } - before, has := db.LatestSealedBlock() - require.True(t, has) - require.Equal(t, uint64(105), before.Number) - require.Equal(t, v1Hash(105), before.Hash) - - origBackoff := errorBackoffPeriod - errorBackoffPeriod = 10 * time.Millisecond - t.Cleanup(func() { errorBackoffPeriod = origBackoff }) - - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - done := make(chan error, 1) - go func() { done <- h.interop.Start(ctx) }() - - require.Eventually(t, func() bool { - return h.interop.backfillCompleted.Load() - }, 15*time.Second, 20*time.Millisecond, - "Start must recover from an offline reorg, not loop forever on ErrParentHashMismatch") - - require.Equal(t, uint64(110), h.interop.backfillEndTimestamp) - - // Assert specific heights, not LatestSealedBlock: once backfill completes - // the main loop may seal further blocks before we read state. 
- canonicalHash := func(n uint64) common.Hash { - return common.BigToHash(new(big.Int).SetUint64(n)) - } - seal110, err := db.FindSealedBlock(110) - require.NoError(t, err, "backfill tip must be sealed") - require.Equal(t, canonicalHash(110), seal110.Hash, - "backfill tip must hold the canonical hash, not a stale v1 hash") - seal103, err := db.FindSealedBlock(103) - require.NoError(t, err) - require.Equal(t, canonicalHash(103), seal103.Hash, - "reorged interior block must be replaced with the canonical hash") - seal100, err := db.FindSealedBlock(100) - require.NoError(t, err) - require.Equal(t, canonicalHash(100), seal100.Hash, - "activation block must be replaced with the canonical hash") - - cancel() - <-done -} - -func TestLogBackfill_RetriesStopOnContextCancel(t *testing.T) { - const act = uint64(100) - depth := 10 * time.Second - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - // SyncStatus always fails — backfill will retry forever. - m.currentL1Err = errors.New("virtual node not ready") - }). - Build() - - origBackoff := errorBackoffPeriod - errorBackoffPeriod = 10 * time.Millisecond - t.Cleanup(func() { errorBackoffPeriod = origBackoff }) - - ctx, cancel := context.WithCancel(context.Background()) - - done := make(chan error, 1) - go func() { done <- h.interop.Start(ctx) }() - - // Let it retry a few times, then cancel. - time.Sleep(100 * time.Millisecond) - cancel() - - select { - case err := <-done: - require.ErrorIs(t, err, context.Canceled) - case <-time.After(5 * time.Second): - t.Fatal("Start did not return after context cancellation") - } -} - -// TestLogBackfill_AsymmetricMultiChain asserts that every chain is backfilled -// over the same [T_lo, minELFinalizedTime] window regardless of how far -// individual chains' EL finalized heads have advanced. 
This keeps the system -// symmetric at startup — every chain has logs sealed up to the same -// timestamp, matching the invariant the main loop observes during normal -// operation. Chains whose EL finalized head is beyond minELFinalizedTime catch -// up through the main loop, not eagerly during backfill. -// -// - T_lo is derived from min(EL finalized head time) across chains. -// - End of backfill for every chain is TimestampToBlockNumber(minELFinalizedTime). -// - backfillEndTimestamp is set to minELFinalizedTime; the main loop -// resumes at backfillEndTimestamp+1. -func TestLogBackfill_AsymmetricMultiChain(t *testing.T) { - const act = uint64(50) - depth := 10 * time.Second // min EL finalized 110 -> T_lo 100 - - // Chain 10: EL finalized tip at 120. - // Chain 20: EL finalized tip at 130. - // Chain 30: EL finalized 110 (the min, pinning T_lo). - // Every chain backfills 100..110 (the shared shape), so each seals 11 - // blocks regardless of how far its EL finalized tip is. - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: 120, Time: 120} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 120, Time: 120}, - SafeL2: eth.L2BlockRef{Number: 120, Time: 120}, - LocalSafeL2: eth.L2BlockRef{Number: 120, Time: 120}, - } - }). - WithChain(20, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: 130, Time: 130} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 130, Time: 130}, - SafeL2: eth.L2BlockRef{Number: 200, Time: 200}, - LocalSafeL2: eth.L2BlockRef{Number: 130, Time: 130}, - } - }).
- WithChain(30, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: 110, Time: 110} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - SafeL2: eth.L2BlockRef{Number: 100, Time: 100}, - LocalSafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - } - }). - Build() - h.interop.ctx = context.Background() - - fetchCount := make(map[eth.ChainID]*atomic.Int32, 3) - for _, id := range []uint64{10, 20, 30} { - c := h.Mock(id) - counter := new(atomic.Int32) - fetchCount[c.id] = counter - c.outputV0Override = func(ctx context.Context, num uint64) (*eth.OutputV0, error) { - counter.Add(1) - return &eth.OutputV0{ - StateRoot: eth.Bytes32(common.HexToHash("0xmockstate")), - MessagePasserStorageRoot: eth.Bytes32(common.HexToHash("0xmockmsg")), - BlockHash: common.BigToHash(new(big.Int).SetUint64(num)), - }, nil - } - } - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - h.interop.backfillEndTimestamp = end - require.Equal(t, act, h.interop.activationTimestamp, "protocol activation must not change") - require.Equal(t, uint64(110), end, - "runLogBackfill must return minELFinalizedTime as the end of the sealed range") - requireFirstVerifiableTimestamp(t, h.interop, 111, - "main loop resumes at backfillEndTimestamp+1") - - chain10 := h.Mock(10) - chain20 := h.Mock(20) - chain30 := h.Mock(30) - - // Every chain backfills the same 100..110 window (11 blocks each).
- require.Equal(t, int32(11), fetchCount[chain10.id].Load(), - "chain 10 should backfill blocks 100..110 (11 blocks)") - require.Equal(t, int32(11), fetchCount[chain20.id].Load(), - "chain 20 should backfill blocks 100..110 (11 blocks)") - require.Equal(t, int32(11), fetchCount[chain30.id].Load(), - "chain 30 should backfill blocks 100..110 (11 blocks)") - - latest10, has10 := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.True(t, has10) - require.Equal(t, uint64(110), latest10.Number) - - latest20, has20 := h.interop.logsDBs[chain20.id].LatestSealedBlock() - require.True(t, has20) - require.Equal(t, uint64(110), latest20.Number) - - latest30, has30 := h.interop.logsDBs[chain30.id].LatestSealedBlock() - require.True(t, has30) - require.Equal(t, uint64(110), latest30.Number) -} - -// TestLogBackfill_MisalignedActivation asserts that backfill succeeds when -// the protocol activation timestamp does not land on a (genesis + k*blockTime) -// boundary. In this configuration TargetBlockNumber(activation) floors to the -// last block whose Time() is strictly before activation: that block -// represents the chain state as of the fork and is the correct pairing -// anchor for the first post-activation block. An overly strict -// "first seal must be >= activation" check would reject this block and the -// retry loop would spin forever with a misleading "virtual nodes may not be -// ready" log line. -// -// Concrete setup: blockTime=3, genesis=0, activation=1000. Block 333 has -// Time()=999 (the pairing anchor); block 334 is at 1002; LocalSafe is at -// block 340, Time=1020. T_lo clamps to activation so backfill must seal -// blocks 333..340 without error. backfillEndTimestamp is set to 1020 -// (minELFinalizedTime), so the main loop resumes at 1021. 
-func TestLogBackfill_MisalignedActivation(t *testing.T) { - const ( - blockTime uint64 = 3 - act uint64 = 1000 - localSafeNum uint64 = 340 - localSafeTs uint64 = 1020 // 340 * blockTime - ) - depth := 60 * time.Second // EL finalized 1020 - 60 = 960 < activation → T_lo clamps to 1000 - - blockNumToTime := func(num uint64) uint64 { return num * blockTime } - tsToBlockNum := func(ctx context.Context, ts uint64) (uint64, error) { - return ts / blockTime, nil // floor, matches rollup.TargetBlockNumber - } - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - m.blockTimeOverride = blockTime - m.blockInfoTimeFn = blockNumToTime - m.timestampToBlockNumberOverride = tsToBlockNum - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: localSafeNum, Time: localSafeTs}, - SafeL2: eth.L2BlockRef{Number: localSafeNum, Time: localSafeTs}, - LocalSafeL2: eth.L2BlockRef{Number: localSafeNum, Time: localSafeTs}, - } - }). - Build() - h.interop.ctx = context.Background() - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - h.interop.backfillEndTimestamp = end - require.Equal(t, act, h.interop.activationTimestamp, "protocol activation must not change") - require.Equal(t, localSafeTs, end, - "runLogBackfill must return minELFinalizedTime as the end of the sealed range") - requireFirstVerifiableTimestamp(t, h.interop, localSafeTs+1) - - chain10 := h.Mock(10) - db := h.interop.logsDBs[chain10.id] - - // processBlockLogs seals a "virtual parent" before the first real backfill - // block so subsequent blocks have a parent to link against. For the first - // backfill block at number 333, that virtual parent is at number 332 with - // the real block's Time() (999).
FirstSealedBlock therefore returns the - // virtual parent — the anchor — and the invariant we care about is that - // its timestamp is strictly pre-activation but within one blockTime of it. - first, err := db.FirstSealedBlock() - require.NoError(t, err) - require.Equal(t, uint64(332), first.Number, "first sealed block is the virtual parent of TargetBlockNumber(activation)") - require.Equal(t, uint64(999), first.Timestamp, - "anchor's Time() is the real block's time, strictly pre-activation — this is the pairing anchor, not a violation") - require.Less(t, first.Timestamp, act, "sanity: anchor is strictly pre-activation") - require.Greater(t, first.Timestamp+blockTime, act, - "anchor must still be within one blockTime of activation (the anchor window)") - - latest, has := db.LatestSealedBlock() - require.True(t, has) - require.Equal(t, localSafeNum, latest.Number) -} - -func TestLogBackfill_AdvancesActivationAndStartsVerifyAfterCeiling(t *testing.T) { - const act = uint64(108) - depth := time.Second // EL finalized 110, depth 1s -> T_lo 109; seals 109..110; first verifiable ts = 111 - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - m.currentL1 = eth.BlockRef{Number: 1, Hash: common.HexToHash("0xL1")} - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - SafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - LocalSafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - } - }). 
- Build() - - var verifyCalls atomic.Int32 - var firstVerifyTS atomic.Uint64 - h.interop.verifyFn = func(ts uint64, blocks map[eth.ChainID]eth.BlockID, _ map[eth.ChainID]eth.BlockID, _ *frontierVerificationView) (Result, error) { - if verifyCalls.Add(1) == 1 { - firstVerifyTS.Store(ts) - } - return Result{ - Timestamp: ts, - L1Inclusion: eth.BlockID{Number: 1, Hash: common.HexToHash("0xL1")}, - L2Heads: blocks, - }, nil - } - h.interop.ctx = context.Background() - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - h.interop.backfillEndTimestamp = end - require.Equal(t, uint64(110), end, - "runLogBackfill must return minELFinalizedTime as the end of the sealed range") - requireFirstVerifiableTimestamp(t, h.interop, 111, - "main loop resumes at backfillEndTimestamp+1") - require.Equal(t, act, h.interop.activationTimestamp, "protocol activation must not change") - - chain10 := h.Mock(10) - latest, has := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.True(t, has) - require.Equal(t, uint64(110), latest.Number) - require.Zero(t, verifyCalls.Load()) - - // Progress the main loop — first verify should be at 111 (activation after backfill). - progressInteropUntil(t, h.interop, 10, func() bool { - lastTS, ok := h.interop.verifiedDB.LastTimestamp() - return ok && lastTS >= 111 - }) - lastTS, ok := h.interop.verifiedDB.LastTimestamp() - require.True(t, ok) - require.GreaterOrEqual(t, lastTS, uint64(111)) - require.Equal(t, int32(1), verifyCalls.Load()) - require.Equal(t, uint64(111), firstVerifyTS.Load()) -} - -// TestLogBackfill_NoOpWhenDepthZero asserts that runLogBackfill short-circuits -// when logBackfillDepth is zero: it must return (0, nil) without touching -// SyncStatus, TimestampToBlockNumber, or the logs DB. This is the "feature -// disabled" path — operators who don't want backfill get no work done. 
-func TestLogBackfill_NoOpWhenDepthZero(t *testing.T) { - const act = uint64(100) - - var syncStatusCalls atomic.Int32 - var outputCalls atomic.Int32 - - h := newInteropTestHarness(t). - WithActivation(act). - // no WithLogBackfillDepth → depth stays at zero-value. - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: 110, Time: 110} - m.elFinalizedHeadSet = true - m.syncStatusOverride = func() (*eth.SyncStatus, error) { - syncStatusCalls.Add(1) - return &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - SafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - LocalSafeL2: eth.L2BlockRef{Number: 110, Time: 110}, - }, nil - } - m.outputV0Override = func(ctx context.Context, num uint64) (*eth.OutputV0, error) { - outputCalls.Add(1) - return &eth.OutputV0{ - StateRoot: eth.Bytes32(common.HexToHash("0xmockstate")), - MessagePasserStorageRoot: eth.Bytes32(common.HexToHash("0xmockmsg")), - BlockHash: common.BigToHash(new(big.Int).SetUint64(num)), - }, nil - } - }). - Build() - h.interop.ctx = context.Background() - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - require.Zero(t, end, "depth==0 short-circuits with end=0") - require.Zero(t, syncStatusCalls.Load(), "SyncStatus must not be called when depth is zero") - require.Zero(t, outputCalls.Load(), "no blocks should be fetched when depth is zero") - - chain10 := h.Mock(10) - _, has := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.False(t, has, "logs DB must remain empty") - - // Caller sets backfillEndTimestamp; with end==0 the main loop derives the - // first unverified timestamp from the current EL finalized head. 
- h.interop.backfillEndTimestamp = end - requireFirstVerifiableTimestamp(t, h.interop, 111, - "with end==0 the main loop starts after the finalized head") -} - -// TestLogBackfill_NoOpWhenNoChains asserts that runLogBackfill short-circuits -// when no chains are registered: no SyncStatus/TimestampToBlockNumber calls -// can happen because there's nothing to iterate, and end must be zero so the -// main loop falls back to activationTimestamp. -func TestLogBackfill_NoOpWhenNoChains(t *testing.T) { - const act = uint64(100) - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(10 * time.Second). - // no WithChain calls — empty chains map. - Build() - require.NotNil(t, h.interop, "Interop must initialize with zero chains") - h.interop.ctx = context.Background() - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - require.Zero(t, end, "empty chains map short-circuits with end=0") - - h.interop.backfillEndTimestamp = end - requireFirstVerifiableTimestamp(t, h.interop, act, - "with end==0 the main loop resumes at activationTimestamp") -} - -// TestLogBackfill_ActivationInFuture asserts the edge case where the -// configured activation is ahead of every chain's EL finalized tip. -// firstVerifiableTimestamp clamps to activation, and backfill must no-op -// instead of sealing beyond the current EL finalized head. -func TestLogBackfill_ActivationInFuture(t *testing.T) { - const act = uint64(2000) - depth := 100 * time.Second - - var outputCalls atomic.Int32 - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - // EL finalized tip at 1000 — well below activation 2000. 
- m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 1000, Time: 1000}, - SafeL2: eth.L2BlockRef{Number: 1000, Time: 1000}, - LocalSafeL2: eth.L2BlockRef{Number: 1000, Time: 1000}, - } - m.outputV0Override = func(ctx context.Context, num uint64) (*eth.OutputV0, error) { - outputCalls.Add(1) - return &eth.OutputV0{ - StateRoot: eth.Bytes32(common.HexToHash("0xmockstate")), - MessagePasserStorageRoot: eth.Bytes32(common.HexToHash("0xmockmsg")), - BlockHash: common.BigToHash(new(big.Int).SetUint64(num)), - }, nil - } - }). - Build() - h.interop.ctx = context.Background() - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - require.Zero(t, end) - require.Zero(t, outputCalls.Load(), - "no blocks fetched: backfill no-ops when activation is ahead of EL finalized") - - chain10 := h.Mock(10) - _, has := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.False(t, has, "logs DB must remain empty while backfill waits") - - require.Equal(t, act, h.interop.activationTimestamp, - "protocol activation must not change") - - requireFirstVerifiableTimestamp(t, h.interop, act, - "main loop resumes at activation when EL finalized is still pre-activation") -} - -// TestLogBackfill_ClampsStartToGenesis asserts that the per-chain start is -// clamped up to the chain's genesis timestamp. Without this clamp, -// runLogBackfill would ask TimestampToBlockNumber for a pre-genesis timestamp -// and try to seal blocks before the chain existed. Both subcases assert the -// same shape: backfill seals exactly the per-chain range [genesisBlock, endBlock]. -func TestLogBackfill_ClampsStartToGenesis(t *testing.T) { - type genesisCase struct { - name string - act uint64 - depth time.Duration - genesisTime uint64 - elFinalizedTip uint64 - // timestampToBlockNum maps a unix timestamp back to the block number the - // chain would return. nil means use the harness default (identity). 
- timestampToBlockNum func(ctx context.Context, ts uint64) (uint64, error) - // blockNumberToTimestamp maps a block number to its unix timestamp. Only - // block 0 (genesis) needs to differ from the identity default. - blockNumberToTimestamp func(ctx context.Context, num uint64) (uint64, error) - // blockInfoTime keeps FetchReceipts' reported block timestamp consistent - // with blockNumberToTimestamp when they diverge from identity. - blockInfoTime func(num uint64) uint64 - wantEndBlock uint64 - wantSealedBlocks int32 - } - - cases := []genesisCase{ - { - // idealStart = 110-50 = 60; startTime = max(60, act=50) = 60. - // Chain's genesis time is 100, which is > 60, so per-chain start - // clamps to genesis and seals blocks 100..110 (11 blocks). - name: "activation before genesis", - act: 50, - depth: 50 * time.Second, - genesisTime: 100, - elFinalizedTip: 110, - blockNumberToTimestamp: func(ctx context.Context, num uint64) (uint64, error) { - if num == 0 { - return 100, nil - } - return num, nil - }, - wantEndBlock: 110, - wantSealedBlocks: 11, - }, - { - // activation == genesis time. idealStart = 110-60 = 50; - // startTime = max(50, act=100) = 100. genesisTime (100) is NOT - // strictly greater than startTime (100), so the per-chain clamp - // is a no-op and chainStartTime stays at activation=100. - // TimestampToBlockNumber(100) returns block 0; the seal range is - // blocks 0..10 (11 blocks) — the genesis block at the activation - // boundary is included and has no logs, which is acceptable. 
- name: "activation equals genesis", - act: 100, - depth: 60 * time.Second, - genesisTime: 100, - elFinalizedTip: 110, - timestampToBlockNum: func(ctx context.Context, ts uint64) (uint64, error) { - return ts - 100, nil - }, - blockNumberToTimestamp: func(ctx context.Context, num uint64) (uint64, error) { - return num + 100, nil - }, - blockInfoTime: func(num uint64) uint64 { return num + 100 }, - wantEndBlock: 10, - wantSealedBlocks: 11, - }, - } - - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - var outputCalls atomic.Int32 - - h := newInteropTestHarness(t). - WithActivation(tc.act). - WithLogBackfillDepth(tc.depth). - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: tc.elFinalizedTip, Time: tc.elFinalizedTip} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: tc.elFinalizedTip, Time: tc.elFinalizedTip}, - SafeL2: eth.L2BlockRef{Number: tc.elFinalizedTip, Time: tc.elFinalizedTip}, - LocalSafeL2: eth.L2BlockRef{Number: tc.elFinalizedTip, Time: tc.elFinalizedTip}, - } - m.blockNumberToTimestampOverride = tc.blockNumberToTimestamp - if tc.timestampToBlockNum != nil { - m.timestampToBlockNumberOverride = tc.timestampToBlockNum - } - if tc.blockInfoTime != nil { - m.blockInfoTimeFn = tc.blockInfoTime - } - m.outputV0Override = func(ctx context.Context, num uint64) (*eth.OutputV0, error) { - outputCalls.Add(1) - return &eth.OutputV0{ - StateRoot: eth.Bytes32(common.HexToHash("0xmockstate")), - MessagePasserStorageRoot: eth.Bytes32(common.HexToHash("0xmockmsg")), - BlockHash: common.BigToHash(new(big.Int).SetUint64(num)), - }, nil - } - }). 
- Build() - h.interop.ctx = context.Background() - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - require.Equal(t, tc.elFinalizedTip, end, - "return value is still minELFinalizedTime regardless of the genesis clamp") - - chain10 := h.Mock(10) - latest, has := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.True(t, has) - require.Equal(t, tc.wantEndBlock, latest.Number) - - require.Equal(t, tc.wantSealedBlocks, outputCalls.Load(), - "backfill must seal exactly [genesisBlock, endBlock]") - }) - } -} - -// TestLogBackfill_UsesVerifiedDBWhenInitializedAndSyncStatusStale simulates startup while -// StatusTracker still reports a stale SafeL2 block and local-safe has moved -// beyond the persisted EL finalized label. With an initialized verifiedDB, -// backfill should cap at verifiedDB.LastTimestamp instead of sampling moving -// SyncStatus state or extending past verifiedDB. -func TestLogBackfill_UsesVerifiedDBWhenInitializedAndSyncStatusStale(t *testing.T) { - const ( - act uint64 = 100 - staleCross uint64 = 100 - elFinalized uint64 = 200 - localSafe uint64 = 200 - lastVerified uint64 = 195 - backfillDepth = 60 * time.Second - ) - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(backfillDepth). - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: elFinalized, Time: elFinalized} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: localSafe, Time: localSafe}, - SafeL2: eth.L2BlockRef{Number: 0, Time: staleCross}, - LocalSafeL2: eth.L2BlockRef{Number: localSafe, Time: localSafe}, - } - }). 
- Build() - h.interop.ctx = context.Background() - - chain10 := h.Mock(10) - for ts := act + 1; ts <= lastVerified; ts++ { - require.NoError(t, h.interop.verifiedDB.Commit(VerifiedResult{ - Timestamp: ts, - L1Inclusion: eth.BlockID{Number: 1, Hash: common.HexToHash("0xL1")}, - L2Heads: map[eth.ChainID]eth.BlockID{chain10.id: {Number: ts, Hash: common.BigToHash(new(big.Int).SetUint64(ts))}}, - })) - } - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - h.interop.backfillEndTimestamp = end - - require.Equal(t, lastVerified, end, - "backfill must derive endTime from verifiedDB.LastTimestamp when verifiedDB is initialized") - - latest, has := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.True(t, has) - require.Equal(t, lastVerified, latest.Number) -} - -// verifiedDB.LastTimestamp typically exceeds min EL finalized; backfill must -// still resume from verifiedDB. -func TestLogBackfill_UsesVerifiedDBWhenAheadOfELFinalized(t *testing.T) { - const ( - act uint64 = 100 - elFinalized uint64 = 190 - lastVerified uint64 = 195 - ) - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(60*time.Second). - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: elFinalized, Time: elFinalized} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: 200, Time: 200}, - SafeL2: eth.L2BlockRef{Number: 0, Time: act}, - LocalSafeL2: eth.L2BlockRef{Number: 200, Time: 200}, - } - }). 
- Build() - h.interop.ctx = context.Background() - - chain10 := h.Mock(10) - for ts := act + 1; ts <= lastVerified; ts++ { - require.NoError(t, h.interop.verifiedDB.Commit(VerifiedResult{ - Timestamp: ts, - L1Inclusion: eth.BlockID{Number: 1, Hash: common.HexToHash("0xL1")}, - L2Heads: map[eth.ChainID]eth.BlockID{chain10.id: {Number: ts, Hash: common.BigToHash(new(big.Int).SetUint64(ts))}}, - })) - } - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - require.Equal(t, lastVerified, end, - "backfill end derives from verifiedDB.LastTimestamp regardless of EL finalized") - - latest, has := h.interop.logsDBs[chain10.id].LatestSealedBlock() - require.True(t, has) - require.Equal(t, lastVerified, latest.Number) -} - -// TestLogBackfill_LeavesAheadLogsDBUnchanged asserts that when a chain's -// logsDB already holds canonical blocks past the computed backfill endTime, -// runLogBackfill does not rewrite, trim, or extend it. reconcileLogsDBTail -// sees the tip hash match canonical and returns without touching state; the -// seal loop then no-ops because startNum (latest+1) > endNum. -// -// Setup: cold start (empty verifiedDB), act=100, depth=60s, -// EL finalized=110 → endTime=110. Pre-seal blocks 100..120 with canonical -// hashes for chain 10. After backfill the logsDB tip must still be 120. -func TestLogBackfill_LeavesAheadLogsDBUnchanged(t *testing.T) { - const ( - act uint64 = 100 - elFinalized uint64 = 110 - preSeedTip uint64 = 120 - ) - depth := 60 * time.Second - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). 
- WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: elFinalized, Time: elFinalized} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: preSeedTip, Time: preSeedTip}, - SafeL2: eth.L2BlockRef{Number: preSeedTip, Time: preSeedTip}, - LocalSafeL2: eth.L2BlockRef{Number: preSeedTip, Time: preSeedTip}, - } - }). - Build() - h.interop.ctx = context.Background() - - chain10 := h.Mock(10) - db := h.interop.logsDBs[chain10.id] - canonicalHash := func(n uint64) common.Hash { - return common.BigToHash(new(big.Int).SetUint64(n)) - } - require.NoError(t, db.SealBlock(common.Hash{}, - eth.BlockID{Number: act, Hash: canonicalHash(act)}, act)) - for n := act + 1; n <= preSeedTip; n++ { - require.NoError(t, db.SealBlock(canonicalHash(n-1), - eth.BlockID{Number: n, Hash: canonicalHash(n)}, n)) - } - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - require.Equal(t, elFinalized, end, - "backfill end must equal minELFinalizedTime, independent of the ahead-of-end logsDB tip") - - latest, has := db.LatestSealedBlock() - require.True(t, has) - require.Equal(t, preSeedTip, latest.Number, - "logsDB must be left untouched when it is already past endTime and canonical") - require.Equal(t, canonicalHash(preSeedTip), latest.Hash, - "logsDB tip hash must be unchanged") -} - -// TestLogBackfill_TrimsNonCanonicalAheadLogsDBAndCatchesUp asserts the -// reconcile-then-backfill behavior when a chain's logsDB sits ahead of -// endTime but its tail has diverged from canonical (e.g. an L2 reorg landed -// while supernode was offline). reconcileLogsDBTail must walk back to the -// last canonical block, after which backfill seals forward up to endTime. -// -// Setup: cold start, act=100, depth=60s, EL finalized=110 → endTime=110. -// Pre-seal blocks 100..108 with canonical hashes, then 109..120 with a v1 -// fork hash. 
Expect reconcile to rewind to 108, and the seal loop to seal -// 109..110 with canonical hashes. Final tip is endTime (110). -func TestLogBackfill_TrimsNonCanonicalAheadLogsDBAndCatchesUp(t *testing.T) { - const ( - act uint64 = 100 - elFinalized uint64 = 110 - lastCanonNum uint64 = 108 - preSeedTip uint64 = 120 - ) - depth := 60 * time.Second - - h := newInteropTestHarness(t). - WithActivation(act). - WithLogBackfillDepth(depth). - WithChain(10, func(m *mockChainContainer) { - m.elFinalizedHead = eth.L2BlockRef{Number: elFinalized, Time: elFinalized} - m.elFinalizedHeadSet = true - m.syncStatusFull = &eth.SyncStatus{ - CurrentL1: eth.L1BlockRef{Number: 1, Hash: common.HexToHash("0xL1")}, - UnsafeL2: eth.L2BlockRef{Number: preSeedTip, Time: preSeedTip}, - SafeL2: eth.L2BlockRef{Number: preSeedTip, Time: preSeedTip}, - LocalSafeL2: eth.L2BlockRef{Number: preSeedTip, Time: preSeedTip}, - } - }). - Build() - h.interop.ctx = context.Background() - - chain10 := h.Mock(10) - db := h.interop.logsDBs[chain10.id] - canonicalHash := func(n uint64) common.Hash { - return common.BigToHash(new(big.Int).SetUint64(n)) - } - v1Hash := func(n uint64) common.Hash { - return common.BigToHash(new(big.Int).SetUint64(n | 0xdead0000)) - } - - require.NoError(t, db.SealBlock(common.Hash{}, - eth.BlockID{Number: act, Hash: canonicalHash(act)}, act)) - for n := act + 1; n <= lastCanonNum; n++ { - require.NoError(t, db.SealBlock(canonicalHash(n-1), - eth.BlockID{Number: n, Hash: canonicalHash(n)}, n)) - } - require.NoError(t, db.SealBlock(canonicalHash(lastCanonNum), - eth.BlockID{Number: lastCanonNum + 1, Hash: v1Hash(lastCanonNum + 1)}, lastCanonNum+1)) - for n := lastCanonNum + 2; n <= preSeedTip; n++ { - require.NoError(t, db.SealBlock(v1Hash(n-1), - eth.BlockID{Number: n, Hash: v1Hash(n)}, n)) - } - - end, err := h.interop.runLogBackfill() - require.NoError(t, err) - require.Equal(t, elFinalized, end, - "backfill end must equal minELFinalizedTime, independent of the ahead-of-end logsDB 
tip") - - latest, has := db.LatestSealedBlock() - require.True(t, has) - require.Equal(t, elFinalized, latest.Number, - "after reconcile + seal, logsDB tip must equal endTime") - require.Equal(t, canonicalHash(elFinalized), latest.Hash, - "final tip must hold the canonical hash, not a stale v1 hash") - - for n := act; n <= elFinalized; n++ { - seal, err := db.FindSealedBlock(n) - require.NoError(t, err, "block %d must remain sealed", n) - require.Equal(t, canonicalHash(n), seal.Hash, - "block %d must hold the canonical hash after reconcile", n) - } -} diff --git a/op-supernode/supernode/activity/interop/logdb.go b/op-supernode/supernode/activity/interop/logdb.go index 0d7bc122a8b..9541eee6bad 100644 --- a/op-supernode/supernode/activity/interop/logdb.go +++ b/op-supernode/supernode/activity/interop/logdb.go @@ -152,7 +152,7 @@ func (i *Interop) verifyCanAddTimestamp(chainID eth.ChainID, db LogsDB, ts uint6 if !hasBlocks { // The main loop starts at firstVerifiableTimestamp. If the DB is empty, // this is the only timestamp the main loop would legitimately seal first. 
- firstVerifiable, err := i.firstVerifiableTimestamp(i.ctx) + firstVerifiable, err := i.firstVerifiableTimestamp() if err != nil { return eth.BlockID{}, hasBlocks, err } diff --git a/op-supernode/supernode/activity/interop/logdb_test.go b/op-supernode/supernode/activity/interop/logdb_test.go index 5d06f9fd4cd..9241bb2c047 100644 --- a/op-supernode/supernode/activity/interop/logdb_test.go +++ b/op-supernode/supernode/activity/interop/logdb_test.go @@ -221,9 +221,11 @@ func TestVerifyPreviousTimestampSealed(t *testing.T) { t.Parallel() interop := &Interop{ - log: gethlog.New(), - activationTimestamp: tt.activationTS, + log: gethlog.New(), + activationTimestamp: tt.activationTS, + verificationStartTimestamp: tt.activationTS, } + interop.initialized.Store(true) chainID := eth.ChainIDFromUInt64(10) expectedHash := common.Hash{0x01} db := &mockLogsDB{ diff --git a/op-supernode/supernode/activity/interop/startup_test.go b/op-supernode/supernode/activity/interop/startup_test.go new file mode 100644 index 00000000000..98b0abb4acd --- /dev/null +++ b/op-supernode/supernode/activity/interop/startup_test.go @@ -0,0 +1,355 @@ +package interop + +import ( + "context" + "errors" + "math/big" + "sync/atomic" + "testing" + "time" + + "github.com/ethereum-optimism/optimism/op-service/eth" + cc "github.com/ethereum-optimism/optimism/op-supernode/supernode/chain_container" + "github.com/ethereum/go-ethereum/common" + "github.com/stretchr/testify/require" +) + +// TestFastInit_ResumesFromVerifiedDB asserts a node with any committed entry +// resumes at LastTimestamp+1 without consulting SafeDB or wall-clock. 
+func TestFastInit_ResumesFromVerifiedDB(t *testing.T) { + + dataDir := t.TempDir() + db, err := OpenVerifiedDB(dataDir) + require.NoError(t, err) + require.NoError(t, db.Commit(VerifiedResult{ + Timestamp: 500, + L1Inclusion: eth.BlockID{Number: 1}, + L2Heads: map[eth.ChainID]eth.BlockID{eth.ChainIDFromUInt64(10): {Number: 50}}, + })) + require.NoError(t, db.Close()) + + interop := New(testLogger(), 100, 0, nil, dataDir, nil, 0, nil) + require.NotNil(t, interop) + defer func() { require.NoError(t, interop.Stop(context.Background())) }() + + interop.fastInit() + require.True(t, interop.initialized.Load()) + require.False(t, interop.waitingForSync) + require.Equal(t, uint64(501), interop.verificationStartTimestamp) +} + +// TestFastInit_ResumeBelowActivationIsAllowed exercises the property that a +// pre-activation resume timestamp is valid: verification iterates harmlessly +// over rounds where no executing messages exist, and verifiedDB stays +// gap-free. +func TestFastInit_ResumeBelowActivationIsAllowed(t *testing.T) { + + dataDir := t.TempDir() + db, err := OpenVerifiedDB(dataDir) + require.NoError(t, err) + require.NoError(t, db.Commit(VerifiedResult{ + Timestamp: 50, + L1Inclusion: eth.BlockID{Number: 1}, + L2Heads: map[eth.ChainID]eth.BlockID{eth.ChainIDFromUInt64(10): {Number: 5}}, + })) + require.NoError(t, db.Close()) + + interop := New(testLogger(), 1000, 0, nil, dataDir, nil, 0, nil) + require.NotNil(t, interop) + defer func() { require.NoError(t, interop.Stop(context.Background())) }() + + interop.fastInit() + require.True(t, interop.initialized.Load()) + require.Equal(t, uint64(51), interop.verificationStartTimestamp, + "resume always uses LastTimestamp+1, never clamps to activation") +} + +// TestFastInit_ColdStartDefersToLoop confirms that with no verifiedDB entry +// fastInit sets waitingForSync without touching SafeDB or wall-clock. 
+func TestFastInit_ColdStartDefersToLoop(t *testing.T) { + + dataDir := t.TempDir() + + interop := New(testLogger(), 1000, 0, nil, dataDir, nil, 0, nil) + require.NotNil(t, interop) + defer func() { require.NoError(t, interop.Stop(context.Background())) }() + + interop.fastInit() + require.False(t, interop.initialized.Load()) + require.True(t, interop.waitingForSync) + require.Zero(t, interop.verificationStartTimestamp) +} + +// TestAdvanceColdStartInit_WaitsWhenAnyChainEmpty exercises the per-iteration +// gate: if any chain has no SafeDB entries yet, advanceColdStartInit returns +// (false, nil) so the loop backs off. +func TestAdvanceColdStartInit_WaitsWhenAnyChainEmpty(t *testing.T) { + + h := newInteropTestHarness(t). + WithActivation(1000). + WithChain(10, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 1234 + m.firstSafeHeadTimestampSet = true + }). + WithChain(20, func(m *mockChainContainer) { + // Default: returns ErrSafeDBEmpty. + }). + Build() + // Harness pre-sets initialized=true for tests that drive the verify + // path; we're exercising cold start, so reset. + h.interop.initialized.Store(false) + h.interop.verificationStartTimestamp = 0 + + advanced, err := h.interop.advanceColdStartInit() + require.NoError(t, err) + require.False(t, advanced, "must wait when any chain reports ErrSafeDBEmpty") + require.False(t, h.interop.initialized.Load()) +} + +// TestAdvanceColdStartInit_PicksMaxClampedToActivation: with all chains +// reporting first SafeDB entries, verificationStartTimestamp is the max of +// (activation, T_c). +func TestAdvanceColdStartInit_PicksMaxClampedToActivation(t *testing.T) { + + t.Run("activation higher than chain timestamps", func(t *testing.T) { + h := newInteropTestHarness(t). + WithActivation(5000). + WithChain(10, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 100 + m.firstSafeHeadTimestampSet = true + }). 
+ WithChain(20, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 200 + m.firstSafeHeadTimestampSet = true + }). + Build() + // logBackfillDepth=0 so backfill is a no-op. + advanced, err := h.interop.advanceColdStartInit() + require.NoError(t, err) + require.True(t, advanced) + require.Equal(t, uint64(5000), h.interop.verificationStartTimestamp) + }) + + t.Run("max chain timestamp higher than activation", func(t *testing.T) { + h := newInteropTestHarness(t). + WithActivation(1000). + WithChain(10, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 1500 + m.firstSafeHeadTimestampSet = true + }). + WithChain(20, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 1750 + m.firstSafeHeadTimestampSet = true + }). + Build() + advanced, err := h.interop.advanceColdStartInit() + require.NoError(t, err) + require.True(t, advanced) + require.Equal(t, uint64(1750), h.interop.verificationStartTimestamp) + }) +} + +// TestAdvanceColdStartInit_PropagatesNonEmptyErrors confirms that +// FirstSafeHeadTimestamp errors other than ErrSafeDBEmpty are fatal. +func TestAdvanceColdStartInit_PropagatesNonEmptyErrors(t *testing.T) { + + fault := errors.New("vn not running") + h := newInteropTestHarness(t). + WithActivation(1000). + WithChain(10, func(m *mockChainContainer) { + m.firstSafeHeadTimestampErr = fault + }). + Build() + + advanced, err := h.interop.advanceColdStartInit() + require.Error(t, err) + require.ErrorIs(t, err, fault) + require.False(t, advanced) +} + +// TestRunLoop_ColdStartTransition drives the loop from waitingForSync to +// initialized via a SafeDB entry appearing after a few iterations. +func TestRunLoop_ColdStartTransition(t *testing.T) { + + h := newInteropTestHarness(t). + WithActivation(1000). + WithChain(10, func(m *mockChainContainer) { + m.blockAtTimestamp = eth.L2BlockRef{Number: 100, Hash: common.HexToHash("0x1")} + }). + Build() + // Reset the harness-faked initialization so we can drive cold start. 
+ h.interop.initialized.Store(false) + h.interop.verificationStartTimestamp = 0 + h.interop.waitingForSync = true + + mock := h.Mock(10) + + // First two iterations: SafeDB empty. Third: populated. + var iterCount atomic.Int32 + go func() { + // Background flipper: after a short delay, populate the chain's + // first safe head timestamp so advanceColdStartInit can complete. + time.Sleep(20 * time.Millisecond) + mock.mu.Lock() + mock.firstSafeHeadTimestamp = 1500 + mock.firstSafeHeadTimestampSet = true + mock.mu.Unlock() + }() + + // Loop a few times waiting for transition. + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + h.interop.ctx = ctx + for !h.interop.initialized.Load() { + select { + case <-ctx.Done(): + t.Fatal("cold start did not complete") + default: + } + advanced, err := h.interop.advanceColdStartInit() + require.NoError(t, err) + if advanced { + h.interop.initialized.Store(true) + h.interop.waitingForSync = false + } else { + time.Sleep(10 * time.Millisecond) + } + iterCount.Add(1) + } + require.True(t, h.interop.initialized.Load()) + require.Equal(t, uint64(1500), h.interop.verificationStartTimestamp) + require.GreaterOrEqual(t, iterCount.Load(), int32(2), + "should have backed off at least once before SafeDB was populated") +} + +// TestColdStartBackfill_NoOpWhenDepthZero confirms backfill is skipped when +// the operator disables it. +func TestColdStartBackfill_NoOpWhenDepthZero(t *testing.T) { + + h := newInteropTestHarness(t). + WithActivation(1000). + WithLogBackfillDepth(0). + WithChain(10, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 1500 + m.firstSafeHeadTimestampSet = true + }). + Build() + + advanced, err := h.interop.advanceColdStartInit() + require.NoError(t, err) + require.True(t, advanced) + require.Equal(t, uint64(1500), h.interop.verificationStartTimestamp) + + // logsDB must be empty: no backfill ran. 
+ _, has := h.interop.logsDBs[eth.ChainIDFromUInt64(10)].LatestSealedBlock() + require.False(t, has, "no blocks should be sealed when logBackfillDepth=0") +} + +// TestColdStartBackfill_NoOpWhenNoChains exercises the empty-chains short +// circuit so advanceColdStartInit completes against an empty depset. +func TestColdStartBackfill_NoOpWhenNoChains(t *testing.T) { + + h := newInteropTestHarness(t). + WithActivation(1000). + Build() + require.Empty(t, h.interop.chains) + + advanced, err := h.interop.advanceColdStartInit() + require.NoError(t, err) + require.True(t, advanced, "empty depset means advance immediately") + require.Equal(t, uint64(1000), h.interop.verificationStartTimestamp) +} + +// TestColdStartBackfill_GenesisClamp exercises the per-chain genesis clamp. +// activationTimestamp=0, depth=1000s, verificationStart=2000 would naively +// yield start=1000; but the chain's genesis time is 1500, so backfill must +// not fetch any block whose timestamp falls below genesis. +func TestColdStartBackfill_GenesisClamp(t *testing.T) { + + depth := 1000 * time.Second + var minFetched atomic.Uint64 + minFetched.Store(^uint64(0)) + h := newInteropTestHarness(t). + WithActivation(0). + WithLogBackfillDepth(depth). + WithChain(10, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 2000 + m.firstSafeHeadTimestampSet = true + m.blockNumberToTimestampOverride = func(_ context.Context, n uint64) (uint64, error) { + if n == 0 { + return 1500, nil + } + return n, nil + } + m.timestampToBlockNumberOverride = func(_ context.Context, ts uint64) (uint64, error) { + return ts, nil + } + m.outputV0Override = func(_ context.Context, num uint64) (*eth.OutputV0, error) { + for { + prev := minFetched.Load() + if num >= prev || minFetched.CompareAndSwap(prev, num) { + break + } + } + return &eth.OutputV0{ + BlockHash: common.BigToHash(new(big.Int).SetUint64(num)), + }, nil + } + }). 
+ Build() + h.interop.initialized.Store(false) + h.interop.verificationStartTimestamp = 0 + + advanced, err := h.interop.advanceColdStartInit() + require.NoError(t, err) + require.True(t, advanced) + require.Equal(t, uint64(2000), h.interop.verificationStartTimestamp) + + require.GreaterOrEqual(t, minFetched.Load(), uint64(1500), + "backfill must not fetch blocks before genesis") +} + +// TestFirstVerifiableTimestamp_PrefersVerifiedDB locks the contract that +// verifiedDB.FirstTimestamp takes precedence over any later +// verificationStartTimestamp set by init. +func TestFirstVerifiableTimestamp_PrefersVerifiedDB(t *testing.T) { + + dataDir := t.TempDir() + db, err := OpenVerifiedDB(dataDir) + require.NoError(t, err) + require.NoError(t, db.Commit(VerifiedResult{ + Timestamp: 200, + L1Inclusion: eth.BlockID{Number: 1}, + L2Heads: map[eth.ChainID]eth.BlockID{eth.ChainIDFromUInt64(10): {Number: 20}}, + })) + require.NoError(t, db.Close()) + + interop := New(testLogger(), 100, 0, nil, dataDir, nil, 0, nil) + require.NotNil(t, interop) + defer func() { require.NoError(t, interop.Stop(context.Background())) }() + + // Resume picks verificationStart=201, but RPC accessor returns 200 + // (the first committed timestamp) for the firstVerifiable boundary. + interop.fastInit() + require.Equal(t, uint64(201), interop.verificationStartTimestamp) + + got, err := interop.firstVerifiableTimestamp() + require.NoError(t, err) + require.Equal(t, uint64(200), got) +} + +// TestFirstVerifiableTimestamp_ErrNotStartedBeforeInit confirms RPC accessors +// return ErrNotStarted while cold-start init is in progress. 
+func TestFirstVerifiableTimestamp_ErrNotStartedBeforeInit(t *testing.T) { + + dataDir := t.TempDir() + interop := New(testLogger(), 1000, 0, nil, dataDir, nil, 0, nil) + require.NotNil(t, interop) + defer func() { require.NoError(t, interop.Stop(context.Background())) }() + + _, err := interop.firstVerifiableTimestamp() + require.ErrorIs(t, err, ErrNotStarted) +} + +// _ ensures the cc import is retained even if helpers shift. +var _ = cc.ErrSafeDBEmpty diff --git a/op-supernode/supernode/activity/supernode/supernode_test.go b/op-supernode/supernode/activity/supernode/supernode_test.go index c96a7605a9a..6b7b50c8c2a 100644 --- a/op-supernode/supernode/activity/supernode/supernode_test.go +++ b/op-supernode/supernode/activity/supernode/supernode_test.go @@ -114,6 +114,10 @@ func (m *mockCC) BlockNumberToTimestamp(ctx context.Context, blocknum uint64) (u return 0, nil } +func (m *mockCC) FirstSafeHeadTimestamp(ctx context.Context) (uint64, error) { + return 0, cc.ErrSafeDBEmpty +} + var _ cc.ChainContainer = (*mockCC)(nil) func TestSupernode_SyncStatus_Succeeds(t *testing.T) { diff --git a/op-supernode/supernode/activity/superroot/superroot_test.go b/op-supernode/supernode/activity/superroot/superroot_test.go index b8f9cf6f2b3..e64ac9f0391 100644 --- a/op-supernode/supernode/activity/superroot/superroot_test.go +++ b/op-supernode/supernode/activity/superroot/superroot_test.go @@ -109,6 +109,9 @@ func (m *mockCC) TimestampToBlockNumber(ctx context.Context, ts uint64) (uint64, func (m *mockCC) BlockNumberToTimestamp(ctx context.Context, blocknum uint64) (uint64, error) { return 0, nil } +func (m *mockCC) FirstSafeHeadTimestamp(ctx context.Context) (uint64, error) { + return 0, cc.ErrSafeDBEmpty +} func (m *mockCC) Generation() uint64 { return 0 } var _ cc.ChainContainer = (*mockCC)(nil) diff --git a/op-supernode/supernode/chain_container/chain_container.go b/op-supernode/supernode/chain_container/chain_container.go index 867de236129..dc8aed72e8d 100644 --- 
a/op-supernode/supernode/chain_container/chain_container.go +++ b/op-supernode/supernode/chain_container/chain_container.go @@ -12,6 +12,7 @@ import ( opnodecfg "github.com/ethereum-optimism/optimism/op-node/config" rollupNode "github.com/ethereum-optimism/optimism/op-node/node" + "github.com/ethereum-optimism/optimism/op-node/node/safedb" "github.com/ethereum-optimism/optimism/op-node/rollup" "github.com/ethereum-optimism/optimism/op-service/client" "github.com/ethereum-optimism/optimism/op-service/eth" @@ -37,6 +38,11 @@ const virtualNodeVersion = "0.1.0" // retrying; recovery requires operator intervention. var ErrHistoryUnavailable = errors.New("safedb history unavailable on this node") +// ErrSafeDBEmpty is returned by FirstSafeHeadTimestamp when SafeDB has no +// entries yet. This is a transient condition during cold start while the VN +// derives its first safe head; callers should back off and retry. +var ErrSafeDBEmpty = errors.New("safedb has no entries yet") + type ChainContainer interface { Start(ctx context.Context) error Stop(ctx context.Context) error @@ -49,6 +55,10 @@ type ChainContainer interface { // TimestampToBlockNumber maps an L2 unix timestamp to the L2 block number (rollup derivation). TimestampToBlockNumber(ctx context.Context, ts uint64) (uint64, error) BlockNumberToTimestamp(ctx context.Context, blocknum uint64) (uint64, error) + // FirstSafeHeadTimestamp returns the L2 block timestamp of the first + // entry in this chain's SafeDB. Returns ErrSafeDBEmpty when the chain + // has not yet derived a safe head. 
+ FirstSafeHeadTimestamp(ctx context.Context) (uint64, error) SyncStatus(ctx context.Context) (*eth.SyncStatus, error) OptimisticAt(ctx context.Context, ts uint64) (l2, l1 eth.BlockID, err error) // OutputRootAtL2BlockHash returns the L2 output root for the canonical @@ -432,6 +442,21 @@ func (c *simpleChainContainer) BlockNumberToTimestamp(ctx context.Context, block return c.vncfg.Rollup.TimestampForBlock(blocknum), nil } +func (c *simpleChainContainer) FirstSafeHeadTimestamp(ctx context.Context) (uint64, error) { + vn := c.getVN() + if vn == nil { + return 0, virtual_node.ErrVirtualNodeNotRunning + } + _, l2, err := vn.FirstSafeHeadEntry(ctx) + if err != nil { + if errors.Is(err, safedb.ErrNotFound) { + return 0, ErrSafeDBEmpty + } + return 0, fmt.Errorf("first safedb entry: %w", err) + } + return c.BlockNumberToTimestamp(ctx, l2.Number) +} + // LocalSafeBlockAtTimestamp returns the highest L2 block with timestamp <= ts using the L2 client, // if the block at that timestamp is local safe. 
func (c *simpleChainContainer) LocalSafeBlockAtTimestamp(ctx context.Context, ts uint64) (eth.L2BlockRef, error) { diff --git a/op-supernode/supernode/chain_container/chain_container_test.go b/op-supernode/supernode/chain_container/chain_container_test.go index 6c08ec57894..2fa145204c7 100644 --- a/op-supernode/supernode/chain_container/chain_container_test.go +++ b/op-supernode/supernode/chain_container/chain_container_test.go @@ -111,6 +111,11 @@ func (m *mockVirtualNode) L1AtSafeHead(ctx context.Context, target eth.BlockID) return m.safeHeadL1, m.safeHeadErr } +// FirstSafeHeadEntry implements virtual_node.VirtualNode FirstSafeHeadEntry +func (m *mockVirtualNode) FirstSafeHeadEntry(ctx context.Context) (eth.BlockID, eth.BlockID, error) { + return m.safeHeadL1, m.safeHeadL2, m.safeHeadErr +} + // LastL1 implements virtual_node.VirtualNode LastL1 func (m *mockVirtualNode) LastL1(ctx context.Context) (eth.BlockID, error) { return m.safeHeadL1, m.safeHeadErr @@ -1050,6 +1055,9 @@ func (m *mockVNForL1AtSafeHeadError) SafeHeadAtL1(ctx context.Context, l1BlockNu func (m *mockVNForL1AtSafeHeadError) L1AtSafeHead(ctx context.Context, target eth.BlockID) (eth.BlockID, error) { return eth.BlockID{}, m.l1AtSafeHeadErr } +func (m *mockVNForL1AtSafeHeadError) FirstSafeHeadEntry(ctx context.Context) (eth.BlockID, eth.BlockID, error) { + return eth.BlockID{}, eth.BlockID{}, nil +} func (m *mockVNForL1AtSafeHeadError) SyncStatus(ctx context.Context) (*eth.SyncStatus, error) { return m.syncStatusResult, nil } diff --git a/op-supernode/supernode/chain_container/invalidation_test.go b/op-supernode/supernode/chain_container/invalidation_test.go index bec5e039d3e..dde3a26f894 100644 --- a/op-supernode/supernode/chain_container/invalidation_test.go +++ b/op-supernode/supernode/chain_container/invalidation_test.go @@ -347,6 +347,9 @@ func (m *mockVNForInvalidation) SafeHeadAtL1(ctx context.Context, l1BlockNum uin func (m *mockVNForInvalidation) L1AtSafeHead(ctx context.Context, target 
eth.BlockID) (eth.BlockID, error) { return eth.BlockID{}, nil } +func (m *mockVNForInvalidation) FirstSafeHeadEntry(ctx context.Context) (eth.BlockID, eth.BlockID, error) { + return eth.BlockID{}, eth.BlockID{}, nil +} func (m *mockVNForInvalidation) SyncStatus(ctx context.Context) (*eth.SyncStatus, error) { return &eth.SyncStatus{}, nil } diff --git a/op-supernode/supernode/chain_container/virtual_node/virtual_node.go b/op-supernode/supernode/chain_container/virtual_node/virtual_node.go index 042bb1d0b06..9d9a0c93c18 100644 --- a/op-supernode/supernode/chain_container/virtual_node/virtual_node.go +++ b/op-supernode/supernode/chain_container/virtual_node/virtual_node.go @@ -43,6 +43,9 @@ type VirtualNode interface { SafeHeadAtL1(ctx context.Context, l1BlockNum uint64) (eth.BlockID, eth.BlockID, error) // L1AtSafeHead returns the earliest L1 block at which the given L2 block became safe. L1AtSafeHead(ctx context.Context, target eth.BlockID) (eth.BlockID, error) + // FirstSafeHeadEntry returns the lowest recorded (L1, L2 safe head) pair from SafeDB. + // Returns safedb.ErrNotFound when SafeDB has no entries yet. + FirstSafeHeadEntry(ctx context.Context) (eth.BlockID, eth.BlockID, error) SyncStatus(ctx context.Context) (*eth.SyncStatus, error) } @@ -212,6 +215,20 @@ func (v *simpleVirtualNode) SafeHeadAtL1(ctx context.Context, l1BlockNum uint64) return db.SafeHeadAtL1(ctx, l1BlockNum) } +func (v *simpleVirtualNode) FirstSafeHeadEntry(ctx context.Context) (eth.BlockID, eth.BlockID, error) { + v.mu.Lock() + inner := v.inner + v.mu.Unlock() + if inner == nil { + return eth.BlockID{}, eth.BlockID{}, ErrVirtualNodeNotRunning + } + db := inner.SafeDB() + if db == nil { + return eth.BlockID{}, eth.BlockID{}, ErrVirtualNodeNotRunning + } + return db.FirstEntry(ctx) +} + // ErrL1AtSafeHeadNotFound: transient — SafeDB hasn't observed the answer yet // (target ahead of latest, or DB empty at startup). Retry. 
var ErrL1AtSafeHeadNotFound = errors.New("l1 at safe head not found") diff --git a/op-supernode/supernode/chain_container/virtual_node/virtual_node_test.go b/op-supernode/supernode/chain_container/virtual_node/virtual_node_test.go index 900e350fbe8..cc727570d71 100644 --- a/op-supernode/supernode/chain_container/virtual_node/virtual_node_test.go +++ b/op-supernode/supernode/chain_container/virtual_node/virtual_node_test.go @@ -111,6 +111,22 @@ func (m *mockSafeDBReader) SafeHeadAtL1(ctx context.Context, l1BlockNum uint64) return entry.l1, entry.l2, nil } +func (m *mockSafeDBReader) FirstEntry(ctx context.Context) (eth.BlockID, eth.BlockID, error) { + if len(m.entries) == 0 { + return eth.BlockID{}, eth.BlockID{}, safedb.ErrNotFound + } + var lowest uint64 + first := true + for num := range m.entries { + if first || num < lowest { + lowest = num + first = false + } + } + entry := m.entries[lowest] + return entry.l1, entry.l2, nil +} + // Test helpers func createTestConfig() *opnodecfg.Config { return &opnodecfg.Config{ From a46698fa6fdc6cd2b89af73b1ce4988faa039a55 Mon Sep 17 00:00:00 2001 From: Adrian Sutton Date: Mon, 18 May 2026 09:12:45 +1000 Subject: [PATCH 2/5] fix(op-devstack/dsl): drop removed BackfillEndTimestamp accessor FirstVerifiableTimestamp() is now the authoritative handoff marker; the BackfillEndTimestamp fallback was tied to the pre-rework startup path and the accessor no longer exists on Interop. --- op-devstack/dsl/supernode.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/op-devstack/dsl/supernode.go b/op-devstack/dsl/supernode.go index 4668aa854f7..6dd2aa61cd2 100644 --- a/op-devstack/dsl/supernode.go +++ b/op-devstack/dsl/supernode.go @@ -187,7 +187,7 @@ func (s *Supernode) AwaitBackfillCompleted() { // (the first seal is at most one block before activation; when activation // is not aligned to a block boundary, the block representing the chain // state as of activation is the correct pairing anchor and is sealed). 
-// 2. firstSealed.Timestamp < BackfillEndTimestamp()+1 +// 2. firstSealed.Timestamp < FirstVerifiableTimestamp() // (the post-backfill handoff happens strictly after the backfilled range) // 3. firstSealed.Timestamp <= max(ActivationTimestamp, latestSealed.Timestamp - depth) // + blockTime (backfill reached ~depth back, @@ -202,9 +202,6 @@ func (s *Supernode) AssertBackfillCovers(depth time.Duration, blockTime uint64, activation := ia.ActivationTimestamp() backfillHandoff := ia.FirstVerifiableTimestamp() - if backfillEnd := ia.BackfillEndTimestamp(); backfillEnd != 0 { - backfillHandoff = backfillEnd + 1 - } depthSec := uint64(depth / time.Second) for _, chainID := range chains { From 0715bbc030711c491cd77c95d0e14858927331b9 Mon Sep 17 00:00:00 2001 From: Adrian Sutton Date: Mon, 18 May 2026 10:12:09 +1000 Subject: [PATCH 3/5] refactor(op-supernode/interop): clarify startup loop and tolerate transient cold-start errors - Rename fastInit to tryInitFromVerifiedDB. - Split runLoop so each iteration runs exactly one of waitForColdStartInit or progress and the loop is the only place that sleeps; each step returns the duration it wants to wait. - Inject clock.Clock so tests can drive time deterministically; replace the remaining time.Sleep/time.Now/time.Since calls. - Treat all advanceColdStartInit errors as retryable (logged + errorBackoff). Cold start races chain-container startup, so transient signals like virtual-node-not-running must not kill the activity; cold start has no ErrHistoryUnavailable path to handle. 
--- .../supernode/activity/interop/interop.go | 142 +++++++++++------- .../activity/interop/startup_test.go | 10 +- 2 files changed, 92 insertions(+), 60 deletions(-) diff --git a/op-supernode/supernode/activity/interop/interop.go b/op-supernode/supernode/activity/interop/interop.go index 3ada95e3981..47b48413008 100644 --- a/op-supernode/supernode/activity/interop/interop.go +++ b/op-supernode/supernode/activity/interop/interop.go @@ -12,6 +12,7 @@ import ( "github.com/ethereum-optimism/optimism/op-node/params" opservice "github.com/ethereum-optimism/optimism/op-service" + "github.com/ethereum-optimism/optimism/op-service/clock" "github.com/ethereum-optimism/optimism/op-service/eth" "github.com/ethereum-optimism/optimism/op-supernode/flags" "github.com/ethereum-optimism/optimism/op-supernode/supernode/activity" @@ -144,17 +145,17 @@ type Interop struct { activationTimestamp uint64 // immutable protocol activation timestamp // verificationStartTimestamp is the first L2 timestamp the main loop - // attempts to verify. Set exactly once during fastInit (resume or - // future-activation paths) or by advanceColdStartInit, then immutable. + // attempts to verify. Set exactly once during tryInitFromVerifiedDB + // (resume path) or by advanceColdStartInit, then immutable. verificationStartTimestamp uint64 // initialized is set true once verificationStartTimestamp has been // chosen. RPC accessors return ErrNotStarted while false. initialized atomic.Bool - // waitingForSync is true between fastInit deferring cold-start origin - // selection and the loop iteration that completes it. Only read/written - // by the main loop goroutine; no mutex needed. + // waitingForSync is true between tryInitFromVerifiedDB deferring + // cold-start origin selection and the loop iteration that completes it. + // Only read/written by the main loop goroutine; no mutex needed. 
waitingForSync bool dataDir string @@ -198,6 +199,10 @@ type Interop struct { logBackfillDepth time.Duration metrics *resources.SupernodeMetrics + + // clock is used for all wall-clock reads and sleeps so deterministic + // tests can inject a fake. Defaults to clock.SystemClock in New. + clock clock.Clock } func (i *Interop) Name() string { @@ -270,6 +275,7 @@ func New( messageExpiryWindow: messageExpiryWindow, logBackfillDepth: logBackfillDepth, metrics: metrics, + clock: clock.SystemClock, } // default to using the verifyInteropMessages function // (can be overridden by tests) @@ -291,16 +297,16 @@ func (i *Interop) Start(ctx context.Context) error { i.started = true i.mu.Unlock() - i.fastInit() + i.tryInitFromVerifiedDB() return i.runLoop() } -// fastInit selects verificationStartTimestamp from verifiedDB if any commit -// exists. Otherwise it defers to the cold-start loop, which waits for every -// chain to record a first SafeDB entry before picking an origin. Wall-clock -// time is not consulted: chain derivation progress is the only authoritative -// signal for "where we are" relative to activation. -func (i *Interop) fastInit() { +// tryInitFromVerifiedDB selects verificationStartTimestamp from verifiedDB if +// any commit exists. Otherwise it defers to the cold-start loop, which waits +// for every chain to record a first SafeDB entry before picking an origin. +// Wall-clock time is not consulted: chain derivation progress is the only +// authoritative signal for "where we are" relative to activation. +func (i *Interop) tryInitFromVerifiedDB() { if lastTS, ok := i.verifiedDB.LastTimestamp(); ok { i.verificationStartTimestamp = lastTS + 1 i.initialized.Store(true) @@ -314,56 +320,82 @@ func (i *Interop) fastInit() { "activationTimestamp", i.activationTimestamp) } -// runLoop drives initialization and verification. 
When waitingForSync is -// true the loop calls advanceColdStartInit each iteration until the cold -// start completes; otherwise it calls progressAndRecord. +// runLoop drives initialization and verification. Each iteration performs +// exactly one of two actions and then sleeps for the duration the action +// chose: waitForColdStartInit while cold-start initialization is in +// progress, otherwise progress to verify the next round. func (i *Interop) runLoop() error { for { - select { - case <-i.ctx.Done(): - return i.ctx.Err() - default: - } - + var ( + sleep time.Duration + err error + ) if i.waitingForSync { - advanced, err := i.advanceColdStartInit() - if err != nil { - i.metrics.ActivityErrors.WithLabelValues("interop", "cold_start_init").Inc() - i.log.Error("interop cold start failed", "err", err) - return fmt.Errorf("interop cold start init: %w", err) - } - if !advanced { - select { - case <-i.ctx.Done(): - return i.ctx.Err() - case <-time.After(backoffPeriod): - } - continue - } - i.waitingForSync = false - i.initialized.Store(true) - i.log.Info("interop cold start complete", - "activationTimestamp", i.activationTimestamp, - "verificationStartTimestamp", i.verificationStartTimestamp) + sleep, err = i.waitForColdStartInit() + } else { + sleep, err = i.progress() } - - madeProgress, err := i.progressAndRecord() if err != nil { - if errors.Is(err, cc.ErrHistoryUnavailable) { - i.metrics.ActivityErrors.WithLabelValues("interop", "history_unavailable").Inc() - i.log.Error("interop activity halted: SafeDB history unavailable on this node", "err", err, - "remediation", "reseed data dir, advance interop.activation-timestamp past the gap, or rederive from L1") - return fmt.Errorf("interop halted due to unavailable history: %w", err) + return err + } + if sleep > 0 { + if err := i.clock.SleepCtx(i.ctx, sleep); err != nil { + return err } - i.metrics.ActivityErrors.WithLabelValues("interop", "progress").Inc() - i.log.Error("failed to progress and record interop", 
"err", err) - time.Sleep(errorBackoffPeriod) - continue } - if !madeProgress { - time.Sleep(backoffPeriod) + } +} + +// waitForColdStartInit runs one cold-start initialization step. Returns +// (0, nil) if the step advanced (so the loop runs again immediately to either +// finish initialization or start progressing), (backoffPeriod, nil) if no +// progress was made yet, or (errorBackoffPeriod, nil) on any error. +// +// Cold-start init runs concurrently with chain-container startup, so every +// failure mode here (VN not yet attached, transient RPC errors, EL not +// ready) is expected during the startup window and must not kill the +// activity. Cold-start has no path to a permanent failure: none of the calls +// it makes return ErrHistoryUnavailable, and any real corruption surfaces in +// the verification loop once initialization completes. +func (i *Interop) waitForColdStartInit() (time.Duration, error) { + advanced, err := i.advanceColdStartInit() + if err != nil { + i.metrics.ActivityErrors.WithLabelValues("interop", "cold_start_init").Inc() + i.log.Warn("interop cold start step failed, will retry", "err", err) + return errorBackoffPeriod, nil + } + if !advanced { + return backoffPeriod, nil + } + i.waitingForSync = false + i.initialized.Store(true) + i.log.Info("interop cold start complete", + "activationTimestamp", i.activationTimestamp, + "verificationStartTimestamp", i.verificationStartTimestamp) + return 0, nil +} + +// progress runs one verification step. Returns (0, nil) when forward progress +// was made (so the loop runs again immediately), (backoffPeriod, nil) when +// the round was a no-op, (errorBackoffPeriod, nil) on a recoverable error, +// or a non-nil error to terminate the loop. 
+func (i *Interop) progress() (time.Duration, error) { + madeProgress, err := i.progressAndRecord() + if err != nil { + if errors.Is(err, cc.ErrHistoryUnavailable) { + i.metrics.ActivityErrors.WithLabelValues("interop", "history_unavailable").Inc() + i.log.Error("interop activity halted: SafeDB history unavailable on this node", "err", err, + "remediation", "reseed data dir, advance interop.activation-timestamp past the gap, or rederive from L1") + return 0, fmt.Errorf("interop halted due to unavailable history: %w", err) } + i.metrics.ActivityErrors.WithLabelValues("interop", "progress").Inc() + i.log.Error("failed to progress and record interop", "err", err) + return errorBackoffPeriod, nil + } + if !madeProgress { + return backoffPeriod, nil } + return 0, nil } // Stop stops the Interop activity. @@ -432,7 +464,7 @@ func (i *Interop) progressAndRecord() (bool, error) { return i.applyPendingTransition(*pending) } - verifyStart := time.Now() + verifyStart := i.clock.Now() output, obs, err := i.progressInterop() if err != nil { return false, err @@ -454,7 +486,7 @@ func (i *Interop) progressAndRecord() (bool, error) { } progress, applyErr := i.applyPendingTransition(pendingTx) // Record verification latency for the full round including apply. 
- i.metrics.InteropVerificationDuration.Observe(time.Since(verifyStart).Seconds()) + i.metrics.InteropVerificationDuration.Observe(i.clock.Since(verifyStart).Seconds()) return progress, applyErr } diff --git a/op-supernode/supernode/activity/interop/startup_test.go b/op-supernode/supernode/activity/interop/startup_test.go index 98b0abb4acd..22edd75b98c 100644 --- a/op-supernode/supernode/activity/interop/startup_test.go +++ b/op-supernode/supernode/activity/interop/startup_test.go @@ -32,7 +32,7 @@ func TestFastInit_ResumesFromVerifiedDB(t *testing.T) { require.NotNil(t, interop) defer func() { require.NoError(t, interop.Stop(context.Background())) }() - interop.fastInit() + interop.tryInitFromVerifiedDB() require.True(t, interop.initialized.Load()) require.False(t, interop.waitingForSync) require.Equal(t, uint64(501), interop.verificationStartTimestamp) @@ -58,14 +58,14 @@ func TestFastInit_ResumeBelowActivationIsAllowed(t *testing.T) { require.NotNil(t, interop) defer func() { require.NoError(t, interop.Stop(context.Background())) }() - interop.fastInit() + interop.tryInitFromVerifiedDB() require.True(t, interop.initialized.Load()) require.Equal(t, uint64(51), interop.verificationStartTimestamp, "resume always uses LastTimestamp+1, never clamps to activation") } // TestFastInit_ColdStartDefersToLoop confirms that with no verifiedDB entry -// fastInit sets waitingForSync without touching SafeDB or wall-clock. +// tryInitFromVerifiedDB sets waitingForSync without touching SafeDB or wall-clock. 
func TestFastInit_ColdStartDefersToLoop(t *testing.T) { dataDir := t.TempDir() @@ -74,7 +74,7 @@ func TestFastInit_ColdStartDefersToLoop(t *testing.T) { require.NotNil(t, interop) defer func() { require.NoError(t, interop.Stop(context.Background())) }() - interop.fastInit() + interop.tryInitFromVerifiedDB() require.False(t, interop.initialized.Load()) require.True(t, interop.waitingForSync) require.Zero(t, interop.verificationStartTimestamp) @@ -330,7 +330,7 @@ func TestFirstVerifiableTimestamp_PrefersVerifiedDB(t *testing.T) { // Resume picks verificationStart=201, but RPC accessor returns 200 // (the first committed timestamp) for the firstVerifiable boundary. - interop.fastInit() + interop.tryInitFromVerifiedDB() require.Equal(t, uint64(201), interop.verificationStartTimestamp) got, err := interop.firstVerifiableTimestamp() From 418d4f72b0d31584e774584d10921e452fbe5d57 Mon Sep 17 00:00:00 2001 From: Adrian Sutton Date: Mon, 18 May 2026 12:04:31 +1000 Subject: [PATCH 4/5] docs(op-supernode/interop): refresh stale comments after startup rework backfillAttempts/backfillCompleted now reference advanceColdStartInit instead of the removed runLogBackfill, and InteropTestControl's accessor list swaps the removed BackfillEndTimestamp for VerificationStartTimestamp. 
--- op-devstack/stack/supernode.go | 2 +- op-supernode/supernode/activity/interop/interop.go | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/op-devstack/stack/supernode.go b/op-devstack/stack/supernode.go index cd8f245a885..73da7e2be6e 100644 --- a/op-devstack/stack/supernode.go +++ b/op-devstack/stack/supernode.go @@ -15,7 +15,7 @@ type Supernode interface { // InteropActivity; see op-supernode/supernode/activity/interop for the // methods available on the returned pointer (PauseAt, Resume, // BackfillAttempts, BackfillCompleted, ActivationTimestamp, -// BackfillEndTimestamp, FirstVerifiableTimestamp, FirstSealedBlock, +// VerificationStartTimestamp, FirstVerifiableTimestamp, FirstSealedBlock, // LatestSealedBlock, ...). type InteropTestControl interface { // InteropActivity returns the current interop activity, or nil if the diff --git a/op-supernode/supernode/activity/interop/interop.go b/op-supernode/supernode/activity/interop/interop.go index 47b48413008..afde1755f5f 100644 --- a/op-supernode/supernode/activity/interop/interop.go +++ b/op-supernode/supernode/activity/interop/interop.go @@ -186,11 +186,13 @@ type Interop struct { // if the next timestamp to process is >= this value. pauseAtTimestamp atomic.Uint64 - // backfillAttempts counts the number of times runLogBackfill was invoked - // since Start. Read by integration tests to confirm the retry loop is engaged. + // backfillAttempts counts cold-start init iterations since the most + // recent Start. Read by integration tests to confirm the retry loop has + // engaged. backfillAttempts atomic.Int32 - // backfillCompleted is set to true once runLogBackfill returns nil (or was skipped - // because logBackfillDepth <= 0). Read by integration tests to gate on backfill finishing. + // backfillCompleted is set true once advanceColdStartInit finishes + // successfully. Read by integration tests to gate on cold-start init + // finishing. 
backfillCompleted atomic.Bool // l1Checker must be non-nil whenever observeRound runs. Production sets it From c640c56445a3ef41abf4b89e4f61b54c301fcc8f Mon Sep 17 00:00:00 2001 From: Karl Floersch Date: Mon, 18 May 2026 13:14:14 -0400 Subject: [PATCH 5/5] fix: fail fast on cold-start backfill errors --- .../supernode/activity/interop/interop.go | 15 +++++---- .../activity/interop/log_backfill.go | 4 ++- .../activity/interop/startup_test.go | 32 +++++++++++++++++++ 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/op-supernode/supernode/activity/interop/interop.go b/op-supernode/supernode/activity/interop/interop.go index afde1755f5f..db021702bc0 100644 --- a/op-supernode/supernode/activity/interop/interop.go +++ b/op-supernode/supernode/activity/interop/interop.go @@ -353,16 +353,19 @@ func (i *Interop) runLoop() error { // finish initialization or start progressing), (backoffPeriod, nil) if no // progress was made yet, or (errorBackoffPeriod, nil) on any error. // -// Cold-start init runs concurrently with chain-container startup, so every -// failure mode here (VN not yet attached, transient RPC errors, EL not -// ready) is expected during the startup window and must not kill the -// activity. Cold-start has no path to a permanent failure: none of the calls -// it makes return ErrHistoryUnavailable, and any real corruption surfaces in -// the verification loop once initialization completes. +// Cold-start init runs concurrently with chain-container startup, so +// pre-backfill readiness failures (VN not yet attached, transient RPC errors, +// EL not ready) are expected during the startup window and must not kill the +// activity. Once the SafeDB handoff has been selected and backfill starts, +// backfill range failures are fatal: retrying cannot make unavailable history +// appear without operator intervention. 
func (i *Interop) waitForColdStartInit() (time.Duration, error) { advanced, err := i.advanceColdStartInit() if err != nil { i.metrics.ActivityErrors.WithLabelValues("interop", "cold_start_init").Inc() + if errors.Is(err, errColdStartBackfill) { + return 0, err + } i.log.Warn("interop cold start step failed, will retry", "err", err) return errorBackoffPeriod, nil } diff --git a/op-supernode/supernode/activity/interop/log_backfill.go b/op-supernode/supernode/activity/interop/log_backfill.go index c44f9755e6e..8384f23f423 100644 --- a/op-supernode/supernode/activity/interop/log_backfill.go +++ b/op-supernode/supernode/activity/interop/log_backfill.go @@ -10,6 +10,8 @@ import ( cc "github.com/ethereum-optimism/optimism/op-supernode/supernode/chain_container" ) +var errColdStartBackfill = errors.New("cold-start backfill failed") + // advanceColdStartInit runs one best-effort pass at cold-start initialization: // it collects every chain's first SafeDB entry timestamp, picks // verificationStartTimestamp = max(activation, max_c T_c), runs backfill, and @@ -41,7 +43,7 @@ func (i *Interop) advanceColdStartInit() (bool, error) { i.initialized.Store(true) if err := i.runColdStartBackfill(verificationStart); err != nil { - return false, fmt.Errorf("backfill: %w", err) + return false, fmt.Errorf("%w: %w", errColdStartBackfill, err) } i.backfillCompleted.Store(true) return true, nil diff --git a/op-supernode/supernode/activity/interop/startup_test.go b/op-supernode/supernode/activity/interop/startup_test.go index 22edd75b98c..3a88c5a216c 100644 --- a/op-supernode/supernode/activity/interop/startup_test.go +++ b/op-supernode/supernode/activity/interop/startup_test.go @@ -309,6 +309,38 @@ func TestColdStartBackfill_GenesisClamp(t *testing.T) { "backfill must not fetch blocks before genesis") } +// TestColdStartBackfill_FailureHaltsStart confirms that once cold-start has +// selected a SafeDB handoff and started backfilling the configured history +// range, failures to fetch that 
range are fatal rather than retried forever. +func TestColdStartBackfill_FailureHaltsStart(t *testing.T) { + + backfillErr := errors.New("historical block unavailable") + h := newInteropTestHarness(t). + WithActivation(100). + WithLogBackfillDepth(10*time.Second). + WithChain(10, func(m *mockChainContainer) { + m.firstSafeHeadTimestamp = 120 + m.firstSafeHeadTimestampSet = true + m.outputV0Override = func(_ context.Context, _ uint64) (*eth.OutputV0, error) { + return nil, backfillErr + } + }). + Build() + h.interop.initialized.Store(false) + h.interop.verificationStartTimestamp = 0 + + done := make(chan error, 1) + go func() { done <- h.interop.Start(context.Background()) }() + + select { + case err := <-done: + require.ErrorIs(t, err, errColdStartBackfill) + require.ErrorIs(t, err, backfillErr) + case <-time.After(5 * time.Second): + t.Fatal("Start retried backfill failure instead of halting") + } +} + // TestFirstVerifiableTimestamp_PrefersVerifiedDB locks the contract that // verifiedDB.FirstTimestamp takes precedence over any later // verificationStartTimestamp set by init.