diff --git a/op-acceptance-tests/tests/supernode/interop/backfill/happy/happy_test.go b/op-acceptance-tests/tests/supernode/interop/backfill/happy/happy_test.go deleted file mode 100644 index a9546fe82bb..00000000000 --- a/op-acceptance-tests/tests/supernode/interop/backfill/happy/happy_test.go +++ /dev/null @@ -1,41 +0,0 @@ -// Package happy contains the happy-path acceptance test for interop log -// backfill. It lives in its own package (rather than a single file) so it -// runs in its own test binary, isolated from the retry-path test. -package happy - -import ( - "testing" - - "github.com/ethereum-optimism/optimism/op-acceptance-tests/tests/supernode/interop/backfill" - "github.com/ethereum-optimism/optimism/op-devstack/devtest" -) - -// TestSupernodeLogBackfill_HappyPath exercises the happy path: -// -// 1. Bring up a two-L2 supernode with interop enabled at genesis. -// 2. Let both chains accumulate more than BackfillDepth of local+cross-safe -// history. -// 3. Hot-restart only the interop activity, wiping its on-disk logs DBs. -// Because every other component (chain containers, virtual nodes, RPC -// server, other activities) keeps running, the replacement activity is -// guaranteed to see ready VNs when it starts backfilling. -// 4. Assert that the logs DBs now span [T_lo, localSafe] for each chain. -// This is the strongest evidence that backfill actually did work: every -// block in the DB was sealed by backfill, because the disk was empty. 
-func TestSupernodeLogBackfill_HappyPath(gt *testing.T) { - t := devtest.SerialT(gt) - sys := backfill.NewTestSystem(t) - - sys.Supernode.AwaitBackfillCompleted() - backfill.AwaitHistoryAtLeast(t, sys, backfill.MinHistoryBeforeRestart) - - sys.Supernode.RestartInterop(true) - sys.Supernode.AwaitBackfillCompleted() - - t.Require().GreaterOrEqual(sys.Supernode.BackfillAttempts(), int32(1), - "post-restart backfill should run at least once") - - sys.Supernode.AssertBackfillCovers(backfill.BackfillDepth, - sys.L2A.Escape().RollupConfig().BlockTime, - sys.L2A.ChainID(), sys.L2B.ChainID()) -} diff --git a/op-acceptance-tests/tests/supernode/interop/backfill/testutil.go b/op-acceptance-tests/tests/supernode/interop/backfill/testutil.go deleted file mode 100644 index b62d8033a78..00000000000 --- a/op-acceptance-tests/tests/supernode/interop/backfill/testutil.go +++ /dev/null @@ -1,52 +0,0 @@ -// Package backfillutil contains shared helpers used by the interop log-backfill -// acceptance tests. The individual test cases live in sibling packages -// (backfill/happy) so that each runs in its own test binary and shares no -// in-process state. -package backfill - -import ( - "time" - - "github.com/ethereum-optimism/optimism/op-devstack/devtest" - "github.com/ethereum-optimism/optimism/op-devstack/presets" -) - -// BackfillDepth is the look-back window configured on every test preset. -// Any value larger than a few block times is enough to exercise the full -// backfill path while still keeping test runtime reasonable. -const BackfillDepth = 60 * time.Second - -// MinHistoryBeforeRestart is how much local+cross-safe history each chain -// must accumulate before we trigger a RestartInterop(wipeLogsDBs=true). -// Strictly larger than BackfillDepth so backfill has a non-empty range to -// reingest — otherwise the coverage assertion is vacuous. 
-const MinHistoryBeforeRestart = BackfillDepth + 30*time.Second - -// NewTestSystem builds a two-L2 interop system with interop active at genesis, -// time-travel enabled, and the supernode configured to run log backfill -// with BackfillDepth on every (re)start of its interop activity. -func NewTestSystem(t devtest.T) *presets.TwoL2SupernodeInterop { - return presets.NewTwoL2SupernodeInterop(t, 0, - presets.WithTimeTravelEnabled(), - presets.WithInteropLogBackfillDepth(BackfillDepth), - ) -} - -// AwaitHistoryAtLeast blocks until both L2 chains' local-safe and -// cross-safe timestamps have advanced at least `age` past genesis. -// Intended to be called before wiping the logs DB so the subsequent -// backfill has a meaningful range to reingest. -func AwaitHistoryAtLeast(t devtest.T, sys *presets.TwoL2SupernodeInterop, age time.Duration) { - t.Helper() - ageSec := uint64(age / time.Second) - deadline := sys.GenesisTime + ageSec - t.Require().Eventuallyf(func() bool { - statusA := sys.L2ACL.SyncStatus() - statusB := sys.L2BCL.SyncStatus() - return statusA.LocalSafeL2.Time >= deadline && - statusB.LocalSafeL2.Time >= deadline && - statusA.SafeL2.Time >= deadline && - statusB.SafeL2.Time >= deadline - }, 5*time.Minute, 2*time.Second, - "both chains must accumulate local+cross safe history of at least %s", age) -} diff --git a/op-acceptance-tests/tests/supernode/interop/startup_resync/startup_resync_test.go b/op-acceptance-tests/tests/supernode/interop/startup_resync/startup_resync_test.go new file mode 100644 index 00000000000..43398ad8016 --- /dev/null +++ b/op-acceptance-tests/tests/supernode/interop/startup_resync/startup_resync_test.go @@ -0,0 +1,91 @@ +// Package startup_resync contains acceptance tests for the op-supernode +// interop startup rework's cold-start resync path: stopping the supernode, +// deleting its on-disk data dir, and starting a fresh supernode against the +// same chain containers and virtual nodes. 
+package startup_resync
+
+import (
+	"testing"
+	"time"
+
+	"github.com/ethereum-optimism/optimism/op-devstack/devtest"
+	"github.com/ethereum-optimism/optimism/op-devstack/dsl"
+	"github.com/ethereum-optimism/optimism/op-devstack/presets"
+	"github.com/ethereum-optimism/optimism/op-supervisor/supervisor/types"
+)
+
+const (
+	l2BlockTime         = uint64(1)       // L2 block time in seconds, set on both chains
+	backfillDepth       = 3 * time.Second // look-back window given to WithInteropLogBackfillDepth
+	preRestartFinalized = uint64(5)       // finalized-head advance awaited before the restart (presumably in blocks)
+)
+
+// TestSupernodeResyncResumesAtActivation_PostActivation drives a full
+// supernode data-dir wipe after the chain has crossed activation, and
+// asserts that cross-safe keeps advancing post-restart and that the
+// cold-start backfill restored history into the logs DB.
+func TestSupernodeResyncResumesAtActivation_PostActivation(gt *testing.T) {
+	t := devtest.SerialT(gt)
+	sys := presets.NewTwoL2SupernodeInterop(t, 0, // delay 0: interop is active from genesis
+		presets.WithUniformL2BlockTimes(l2BlockTime),
+		presets.WithInteropLogBackfillDepth(backfillDepth),
+	)
+
+	sys.Supernode.AwaitBackfillCompleted() // let the initial startup settle before driving the chain
+
+	// Setup: let L2 finalized advance several blocks on both chains. On
+	// restart, op-node may drop back as part of its safe start process,
+	// but won't go past the finalized head. With finalized well past
+	// genesis the post-restart cold-start backfill has a real window to
+	// populate, instead of collapsing to empty against a re-recorded
+	// genesis SafeDB entry.
+	dsl.CheckAll(t,
+		sys.L2ACL.AdvancedFn(types.Finalized, preRestartFinalized, 180),
+		sys.L2BCL.AdvancedFn(types.Finalized, preRestartFinalized, 180),
+	)
+
+	sys.Supernode.RestartWithFreshDataDir() // stop, wipe the data dir, start a fresh supernode
+	sys.Supernode.AwaitBackfillCompleted()  // now waits on the replacement instance
+
+	dsl.CheckAll(t,
+		sys.L2ACL.AdvancedFn(types.CrossSafe, 1, 60),
+		sys.L2BCL.AdvancedFn(types.CrossSafe, 1, 60),
+	)
+
+	// Verify the cold-start backfill repopulated the logs DB.
+	sys.Supernode.AssertBackfillCovers(backfillDepth, l2BlockTime,
+		sys.L2A.ChainID(), sys.L2B.ChainID())
+}
+
+// TestSupernodeResyncSchedulesAtActivation_PreActivation drives a full
+// supernode data-dir wipe while interop is scheduled but not yet active,
+// and asserts that cold-start init parks the verifier at the (future)
+// activation timestamp while cross-safe keeps advancing on both chains.
+func TestSupernodeResyncSchedulesAtActivation_PreActivation(gt *testing.T) {
+	t := devtest.SerialT(gt)
+	// 60-minute delay: ensures the chain never approaches activation during
+	// the test, so we always exercise the genuine pre-activation cold-start
+	// path regardless of CI scheduling variance.
+	sys := presets.NewTwoL2SupernodeInterop(t, 60*60,
+		presets.WithUniformL2BlockTimes(l2BlockTime),
+		presets.WithInteropLogBackfillDepth(backfillDepth),
+	)
+
+	sys.Supernode.AwaitBackfillCompleted()
+	activation := sys.Supernode.ActivationTimestamp() // captured before the restart, compared after it
+
+	// Setup: let local-safe accumulate enough that op-node's SafeDB has
+	// entries to serve to the post-restart cold-start init.
+	dsl.CheckAll(t,
+		sys.L2ACL.AdvancedFn(types.LocalSafe, 2, 30),
+		sys.L2BCL.AdvancedFn(types.LocalSafe, 2, 30),
+	)
+
+	sys.Supernode.RestartWithFreshDataDir()
+	sys.Supernode.AwaitVerificationStartsAt(activation) // fails the test on timeout or mismatch
+
+	dsl.CheckAll(t,
+		sys.L2ACL.AdvancedFn(types.CrossSafe, 1, 60),
+		sys.L2BCL.AdvancedFn(types.CrossSafe, 1, 60),
+	)
+}
diff --git a/op-devstack/dsl/supernode.go b/op-devstack/dsl/supernode.go
index 6dd2aa61cd2..b7354ce2717 100644
--- a/op-devstack/dsl/supernode.go
+++ b/op-devstack/dsl/supernode.go
@@ -16,7 +16,7 @@ import (
 type Supernode struct {
 	commonImpl
 	inner       stack.Supernode
-	testControl stack.InteropTestControl
+	testControl stack.SupernodeTestControl
 }
 
 // NewSupernode creates a new Supernode DSL wrapper
@@ -29,7 +29,7 @@ func NewSupernode(inner stack.Supernode) *Supernode {
 
 // NewSupernodeWithTestControl creates a new Supernode DSL wrapper with test control support.
// The testControl parameter can be nil if no test control is needed. -func NewSupernodeWithTestControl(inner stack.Supernode, testControl stack.InteropTestControl) *Supernode { +func NewSupernodeWithTestControl(inner stack.Supernode, testControl stack.SupernodeTestControl) *Supernode { return &Supernode{ commonImpl: commonFromT(inner.T()), inner: inner, @@ -129,19 +129,16 @@ func (s *Supernode) ResumeInterop() { s.interopActivity().Resume() } -// RestartInterop stops the running interop activity, optionally wipes its -// on-disk logs DBs, and launches a fresh instance against the still-running -// supernode. The HTTP server, chain containers, virtual nodes, and all other -// activities keep running across the restart. Setting wipeLogsDBs=true forces -// the fresh activity to reconstruct its database via log backfill from the -// virtual nodes, making this the primary primitive for exercising backfill -// in tests. -// Requires the Supernode to be created with NewSupernodeWithTestControl. -func (s *Supernode) RestartInterop(wipeLogsDBs bool) { - s.require.NotNil(s.testControl, "RestartInterop requires test control; use NewSupernodeWithTestControl") - s.log.Info("restarting interop activity", "wipeLogsDBs", wipeLogsDBs) - err := s.testControl.RestartInteropActivity(wipeLogsDBs) - s.require.NoError(err, "failed to restart interop activity") +// RestartWithFreshDataDir stops the supernode, deletes its on-disk data +// directory in full, and starts a fresh supernode against the same chain +// containers, virtual nodes, and externally-visible RPC address. +// Requires NewSupernodeWithTestControl. 
+func (s *Supernode) RestartWithFreshDataDir() {
+	s.require.NotNil(s.testControl,
+		"RestartWithFreshDataDir requires test control; use NewSupernodeWithTestControl")
+	s.log.Info("restarting supernode with fresh data dir")
+	err := s.testControl.RestartWithFreshDataDir() // any error fails the test on the next line
+	s.require.NoError(err, "failed to restart supernode with fresh data dir")
 }
 
 // BackfillAttempts returns the number of log-backfill attempts since the
@@ -178,6 +175,37 @@ func (s *Supernode) AwaitBackfillCompleted() {
 	s.require.NoError(err, "backfill did not complete in time")
 }
 
+// ActivationTimestamp returns the configured interop activation timestamp.
+// Requires NewSupernodeWithTestControl.
+func (s *Supernode) ActivationTimestamp() uint64 {
+	return s.interopActivity().ActivationTimestamp()
+}
+
+// VerificationStartTimestamp returns the L2 timestamp the current interop
+// activity began verifying at. Returns 0 before cold-start init completes.
+// Requires NewSupernodeWithTestControl.
+func (s *Supernode) VerificationStartTimestamp() uint64 {
+	return s.interopActivity().VerificationStartTimestamp()
+}
+
+// AwaitVerificationStartsAt waits (500ms interval, bounded by 3*DefaultTimeout)
+// for BackfillCompleted, then requires VerificationStartTimestamp == expected;
+// a timeout or mismatch fails the test. Requires NewSupernodeWithTestControl.
+func (s *Supernode) AwaitVerificationStartsAt(expected uint64) {
+	ia := s.interopActivity() // held for the whole call; no restart happens in here
+	ctx, cancel := context.WithTimeout(s.ctx, 3*DefaultTimeout)
+	defer cancel()
+	err := wait.For(ctx, 500*time.Millisecond, func() (bool, error) {
+		return ia.BackfillCompleted(), nil // used as the "cold-start init is done" signal
+	})
+	s.require.NoError(err, "cold-start initialization did not complete in time")
+	actual := ia.VerificationStartTimestamp() // read only after init has completed
+	s.require.Equalf(expected, actual,
+		"verificationStartTimestamp mismatch after cold-start init: expected %d, got %d",
+		expected, actual)
+	s.log.Info("verification start timestamp confirmed", "expected", expected, "actual", actual)
+}
+
 // AssertBackfillCovers verifies, for each supplied chain, that the interop
 // logs DB contains blocks spanning from a first-seal at or near the expected
 // T_lo all the way to a latest-seal at or near the safe tip. Specifically it
diff --git a/op-devstack/presets/options.go b/op-devstack/presets/options.go
index a4cd786eef6..eabcc890df3 100644
--- a/op-devstack/presets/options.go
+++ b/op-devstack/presets/options.go
@@ -289,6 +289,12 @@ func WithL2BlockTimes(blockTimes map[eth.ChainID]uint64) Option {
 	return WithDeployerOptions(sysgo.WithL2BlockTimes(blockTimes))
 }
 
+// WithUniformL2BlockTimes configures the same L2 block time (in seconds) on
+// every configured L2 chain via the deployer.
+func WithUniformL2BlockTimes(seconds uint64) Option {
+	return WithDeployerOptions(sysgo.WithUniformL2BlockTimes(seconds)) // thin wrapper over the sysgo deployer option
+}
+
 // WithInteropLogBackfillDepth configures the supernode to pre-ingest
 // initiating-message logs backward from the tip by the given duration at
 // startup. Zero disables backfill (the default).
diff --git a/op-devstack/stack/supernode.go b/op-devstack/stack/supernode.go
index 73da7e2be6e..15c7da235ac 100644
--- a/op-devstack/stack/supernode.go
+++ b/op-devstack/stack/supernode.go
@@ -10,23 +10,17 @@ type Supernode interface {
 	QueryAPI() apis.SupernodeQueryAPI
 }
 
-// InteropTestControl is the narrow integration-test surface on a running
-// supernode. Tests get direct access to the interop activity via
-// InteropActivity; see op-supernode/supernode/activity/interop for the
-// methods available on the returned pointer (PauseAt, Resume,
-// BackfillAttempts, BackfillCompleted, ActivationTimestamp,
-// VerificationStartTimestamp, FirstVerifiableTimestamp, FirstSealedBlock,
-// LatestSealedBlock, ...).
-type InteropTestControl interface {
+// SupernodeTestControl is the integration-test surface on a running
+// supernode. See op-supernode/supernode/activity/interop for the methods
+// available on the InteropActivity pointer.
+type SupernodeTestControl interface {
 	// InteropActivity returns the current interop activity, or nil if the
-	// supernode is not running or interop is not configured. Callers must
-	// not cache the pointer across RestartInteropActivity, which swaps the
-	// activity for a fresh instance.
+	// supernode is stopped or interop is not configured. Do not cache the
+	// pointer across RestartWithFreshDataDir.
 	InteropActivity() *interop.Interop
 
-	// RestartInteropActivity stops the running interop activity, optionally
-	// wipes its on-disk logs DBs, and launches a fresh instance against the
-	// still-running supernode (HTTP server, chain containers, and all other
-	// activities remain up).
-	RestartInteropActivity(wipeLogsDBs bool) error
+	// RestartWithFreshDataDir stops the supernode, deletes its on-disk
+	// data directory, and starts a fresh supernode against the same chain
+	// containers, virtual nodes, and externally-visible RPC address.
+	RestartWithFreshDataDir() error
 }
diff --git a/op-devstack/sysgo/deployer.go b/op-devstack/sysgo/deployer.go
index 10cd0561faf..13e481ae605 100644
--- a/op-devstack/sysgo/deployer.go
+++ b/op-devstack/sysgo/deployer.go
@@ -357,6 +357,16 @@ func WithL2BlockTimes(blockTimes map[eth.ChainID]uint64) DeployerOption {
 	}
 }
 
+// WithUniformL2BlockTimes sets the same L2 block time (in seconds) on every
+// configured L2 chain.
+func WithUniformL2BlockTimes(seconds uint64) DeployerOption {
+	return func(_ devtest.T, _ devkeys.Keys, builder intentbuilder.Builder) {
+		for _, l2Cfg := range builder.L2s() {
+			l2Cfg.WithBlockTime(seconds)
+		}
+	}
+}
+
 // WithFinalizationPeriodSeconds overrides the number of L1 blocks in a sequencing window, applied to all L2s.
 func WithFinalizationPeriodSeconds(n uint64) DeployerOption {
 	return func(p devtest.T, keys devkeys.Keys, builder intentbuilder.Builder) {
diff --git a/op-devstack/sysgo/l2_cl_supernode.go b/op-devstack/sysgo/l2_cl_supernode.go
index 2112117ea03..797a4607d70 100644
--- a/op-devstack/sysgo/l2_cl_supernode.go
+++ b/op-devstack/sysgo/l2_cl_supernode.go
@@ -3,6 +3,8 @@ package sysgo
 import (
 	"context"
 	"errors"
+	"fmt"
+	"os"
 	"sync"
 	"time"
 
@@ -11,6 +13,7 @@ import (
 	"github.com/ethereum-optimism/optimism/op-devstack/devtest"
 	"github.com/ethereum-optimism/optimism/op-node/config"
 	"github.com/ethereum-optimism/optimism/op-service/eth"
+	"github.com/ethereum-optimism/optimism/op-service/testutils/tcpproxy"
 	snconfig "github.com/ethereum-optimism/optimism/op-supernode/config"
 	"github.com/ethereum-optimism/optimism/op-supernode/supernode"
 	"github.com/ethereum-optimism/optimism/op-supernode/supernode/activity/interop"
@@ -22,6 +25,7 @@ type SuperNode struct {
 	mu               sync.Mutex
 	sn               *supernode.Supernode
 	cancel           context.CancelFunc
+	httpProxy        *tcpproxy.Proxy
 	userRPC          string
 	interopEndpoint  string
 	interopJwtSecret eth.Bytes32
@@ -49,6 +53,14 @@ func (n *SuperNode) InteropRPC() (endpoint string, jwtSecret eth.Bytes32) {
 func (n *SuperNode) Start() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
+	n.startLocked()
+}
+
+// startLocked brings up the supernode and points the long-lived httpProxy
+// at its newly-bound RPC port. The proxy is created on first start and
+// reused so external callers see a stable URL across restarts. Caller must
+// hold n.mu.
+func (n *SuperNode) startLocked() {
 	if n.sn != nil {
 		n.logger.Warn("Supernode already started")
 		return
@@ -56,6 +68,17 @@ func (n *SuperNode) Start() {
 
 	n.p.Require().NotNil(n.snCfg, "supernode CLI config required")
 
+	if n.httpProxy == nil {
+		n.httpProxy = tcpproxy.New(n.logger.New("proxy", "supernode-http"))
+		n.p.Require().NoError(n.httpProxy.Start(), "supernode http proxy failed to start")
+		n.p.Cleanup(func() {
+			_ = n.httpProxy.Close() // best-effort: runs at test cleanup, nothing left to act on a close error
+		})
+		base := "http://" + n.httpProxy.Addr()
+		n.userRPC = base
+		n.interopEndpoint = base
+	}
+
 	ctx, cancel := context.WithCancel(n.p.Ctx())
 	exitFn := func(err error) { n.p.Errorf("supernode critical error: %v", err) }
 	sn, err := supernode.New(ctx, n.logger, "devstack", exitFn, n.snCfg, n.vnCfgs)
@@ -67,32 +90,38 @@ func (n *SuperNode) Start() {
 
 	addr, err := n.sn.WaitRPCAddr(ctx)
 	n.p.Require().NoError(err, "supernode failed to bind RPC address")
-	base := "http://" + addr
-	n.userRPC = base
-	n.interopEndpoint = base
+	n.httpProxy.SetUpstream(ProxyAddr(n.p.Require(), "http://"+addr))
 }
 
 func (n *SuperNode) Stop() {
 	n.mu.Lock()
 	defer n.mu.Unlock()
+	n.stopLocked()
+}
+
+// stopLocked tears down the supernode instance, leaving httpProxy in place
+// so a later startLocked can repoint it. A failed Stop is logged, not
+// returned, and the instance is dropped regardless. Caller must hold n.mu.
+func (n *SuperNode) stopLocked() {
 	if n.sn == nil {
 		n.logger.Warn("Supernode already stopped")
 		return
 	}
 	if n.cancel != nil {
 		n.cancel()
+		n.cancel = nil
 	}
-	// Attempt graceful stop
 	stopCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 	defer cancel()
-	_ = n.sn.Stop(stopCtx)
+	if err := n.sn.Stop(stopCtx); err != nil {
+		n.logger.Warn("Supernode did not stop cleanly", "err", err)
+	}
 	n.sn = nil
 }
 
-// InteropActivity returns the interop activity running inside the supernode,
-// or nil if the supernode is stopped or has no interop activity. Callers must
-// not cache the returned pointer across RestartInteropActivity, which swaps
-// the activity for a fresh instance. For integration test control only.
+// InteropActivity returns the interop activity, or nil if the supernode is
+// stopped or has no interop activity. The pointer is bound to the current
+// instance; do not cache across RestartWithFreshDataDir. Test-only.
 func (n *SuperNode) InteropActivity() *interop.Interop {
 	n.mu.Lock()
 	defer n.mu.Unlock()
@@ -102,16 +131,28 @@ func (n *SuperNode) InteropActivity() *interop.Interop {
 	return n.sn.InteropActivity()
 }
 
-// RestartInteropActivity stops the running interop activity, optionally
-// wipes its on-disk logs DBs, and launches a fresh instance against the
-// still-running supernode. For integration test control only.
-func (n *SuperNode) RestartInteropActivity(wipeLogsDBs bool) error {
+// RestartWithFreshDataDir stops the supernode, deletes its on-disk data
+// directory, and starts a fresh supernode against the same chain
+// containers, virtual nodes, and externally-visible RPC address. Test-only.
+func (n *SuperNode) RestartWithFreshDataDir() error {
 	n.mu.Lock()
 	defer n.mu.Unlock()
 	if n.sn == nil {
 		return errSupernodeNotRunning
 	}
-	return n.sn.RestartInteropActivity(wipeLogsDBs)
+	if n.snCfg == nil || n.snCfg.DataDir == "" {
+		return errors.New("sysgo: RestartWithFreshDataDir requires a configured supernode DataDir")
+	}
+	n.logger.Info("restarting supernode with fresh data dir", "data_dir", n.snCfg.DataDir)
+	n.stopLocked()
+	if err := os.RemoveAll(n.snCfg.DataDir); err != nil {
+		return fmt.Errorf("sysgo: wipe supernode data dir %s: %w", n.snCfg.DataDir, err)
+	}
+	if err := os.MkdirAll(n.snCfg.DataDir, 0o755); err != nil {
+		return fmt.Errorf("sysgo: recreate supernode data dir %s: %w", n.snCfg.DataDir, err)
+	}
+	n.startLocked()
+	return nil
 }
 
 // SuperNodeProxy is a thin wrapper that points to a shared supernode instance.
diff --git a/op-supernode/supernode/chain_container/chain_container.go b/op-supernode/supernode/chain_container/chain_container.go
index dc8aed72e8d..1ea117475a9 100644
--- a/op-supernode/supernode/chain_container/chain_container.go
+++ b/op-supernode/supernode/chain_container/chain_container.go
@@ -150,10 +150,7 @@ type simpleChainContainer struct {
 	rollupClient *sources.RollupClient // In-proc rollup RPC client bound to rpcHandler
 	metrics      *resources.SupernodeMetrics
 
-	// verifiersMu guards writes and reads of the verifiers slice. Concurrent
-	// readers (VerifiedAt, VerifierCurrentL1s) can race with the test-only
-	// ReplaceVerifier path used by RestartInteropActivity, which swaps a
-	// verifier while the chain container is still running.
+	// verifiersMu guards writes and reads of the verifiers slice.
verifiersMu sync.RWMutex verifiers []activity.VerificationActivity onReset ResetCallback // Called when chain resets to notify activities @@ -234,23 +231,6 @@ func (c *simpleChainContainer) RegisterVerifier(v activity.VerificationActivity) c.verifiers = append(c.verifiers, v) } -// ReplaceVerifier swaps a previously-registered verifier for a new one by -// pointer identity. Returns true if a replacement occurred. Intended for -// integration-test orchestration that restarts a single activity while the -// chain container keeps running. Not part of the ChainContainer interface -// because production code has no reason to replace verifiers. -func (c *simpleChainContainer) ReplaceVerifier(old, new activity.VerificationActivity) bool { - c.verifiersMu.Lock() - defer c.verifiersMu.Unlock() - for i, v := range c.verifiers { - if v == old { - c.verifiers[i] = new - return true - } - } - return false -} - func (c *simpleChainContainer) VerifierCurrentL1s() []eth.BlockID { c.verifiersMu.RLock() defer c.verifiersMu.RUnlock() diff --git a/op-supernode/supernode/supernode.go b/op-supernode/supernode/supernode.go index 95806a225d3..4a0e26ca660 100644 --- a/op-supernode/supernode/supernode.go +++ b/op-supernode/supernode/supernode.go @@ -36,10 +36,7 @@ type Supernode struct { stopped bool cfg *config.CLIConfig chains map[eth.ChainID]cc.InteropChain - // activitiesMu guards reads and writes of the activities slice. Concurrent - // readers (onChainReset, InteropActivity, Stop) can race with the - // test-only RestartInteropActivity path that swaps the interop activity - // while other activities and chain containers are still running. + // activitiesMu guards reads and writes of the activities slice. 
activitiesMu sync.RWMutex activities []activity.Activity rootRPC *oprpc.Handler @@ -55,15 +52,6 @@ type Supernode struct { supernodeMetrics *resources.SupernodeMetrics // cached address when available rpcAddr string - - // Cached parameters needed to reconstruct the interop activity in - // RestartInteropActivity (test-only). See supernode_test_access.go. - interopActivationTs *uint64 - interopMsgExpiryWindow uint64 - // lifecycleCtx is the parent context for all activity goroutines, captured - // from Start(). RestartInteropActivity uses it to re-launch the interop - // activity without disturbing other activities. - lifecycleCtx context.Context } func New(ctx context.Context, log gethlog.Logger, version string, requestStop context.CancelCauseFunc, cfg *config.CLIConfig, vnCfgs map[eth.ChainID]*opnodecfg.Config) (*Supernode, error) { @@ -138,8 +126,6 @@ func New(ctx context.Context, log gethlog.Logger, version string, requestStop co } interopActivity = interop.New(log.New("activity", "interop"), *interopActivationTimestamp, msgExpiryWindow, s.chains, cfg.DataDir, s.l1Client, cfg.InteropLogBackfillDepth, s.supernodeMetrics) verifiedReader = interopActivity - s.interopActivationTs = interopActivationTimestamp - s.interopMsgExpiryWindow = msgExpiryWindow } // Order in this slice governs Start/Stop ordering; interop is appended @@ -243,7 +229,6 @@ func (s *Supernode) Start(ctx context.Context) error { // found cancel == nil) before Start() had a chance to initialize. 
var lifecycleCtx context.Context lifecycleCtx, s.lifecycleCancel = context.WithCancel(ctx) - s.lifecycleCtx = lifecycleCtx if s.httpServer != nil { s.wg.Add(1) diff --git a/op-supernode/supernode/supernode_test_access.go b/op-supernode/supernode/supernode_test_access.go index ca92ad03f65..efdaa5a1373 100644 --- a/op-supernode/supernode/supernode_test_access.go +++ b/op-supernode/supernode/supernode_test_access.go @@ -1,27 +1,15 @@ package supernode -// This file collects Supernode methods that expose test-only access to the -// interop activity. They must not be called by production code paths. Keeping -// them in one file makes the test-only surface easy to audit alongside -// interop/interop_test_access.go. +// Test-only Supernode methods. Production code paths must not call these. import ( - "context" - "errors" - "fmt" - "os" - "path/filepath" - - "github.com/ethereum-optimism/optimism/op-supernode/supernode/activity" "github.com/ethereum-optimism/optimism/op-supernode/supernode/activity/interop" ) -var errNoInteropActivity = errors.New("supernode: no interop activity") - -// InteropActivity returns the single interop activity registered with the -// supernode, or nil if interop is not configured or has not started yet. -// Callers must not cache the returned pointer across RestartInteropActivity; -// that path swaps the underlying activity for a fresh instance. +// InteropActivity returns the registered interop activity, or nil if interop +// is not configured or has not started yet. The pointer is bound to the +// current Supernode instance; tests that tear the supernode down must +// re-fetch after restart. 
func (s *Supernode) InteropActivity() *interop.Interop { s.activitiesMu.RLock() defer s.activitiesMu.RUnlock() @@ -32,120 +20,3 @@ func (s *Supernode) InteropActivity() *interop.Interop { } return nil } - -// verifierReplacer is the subset of simpleChainContainer we depend on in -// RestartInteropActivity to swap a verifier registration without touching -// the public ChainContainer interface. -type verifierReplacer interface { - ReplaceVerifier(old, new activity.VerificationActivity) bool -} - -// RestartInteropActivity stops the running interop activity (if any), -// optionally wipes its on-disk logs DB files, constructs a fresh instance -// from the originally-configured parameters, re-registers it with each chain -// container as a verifier, and starts it under the supernode's existing -// lifecycle context. The HTTP server, chain containers, and all other -// activities keep running. This is the core primitive for tests that want -// to exercise log backfill against a running, ready cluster without the -// cost and flakiness of restarting the entire supernode. -// -// Any test-only mutations on the old activity are discarded when it is -// Stopped. -func (s *Supernode) RestartInteropActivity(wipeLogsDBs bool) error { - if s.lifecycleCtx == nil { - return fmt.Errorf("supernode: RestartInteropActivity called before Start") - } - if s.interopActivationTs == nil { - return fmt.Errorf("supernode: RestartInteropActivity called but interop was never configured") - } - // Validate the DataDir precondition up front so a misconfigured call fails - // before we tear down the old activity and lose its in-memory state. - if wipeLogsDBs && (s.cfg == nil || s.cfg.DataDir == "") { - return fmt.Errorf("supernode: cannot wipe logs DBs without a configured DataDir") - } - - old := s.InteropActivity() - if old == nil { - return errNoInteropActivity - } - - // Stop the old activity: cancels its ctx, waits its loop to exit on its - // own, then closes verifiedDB and all logs DBs. 
Safe to ignore errors as - // Stop only surfaces close errors and we're about to wipe/reopen. - _ = old.Stop(context.Background()) - - if wipeLogsDBs { - for chainID := range s.chains { - chainDir := filepath.Join(s.cfg.DataDir, fmt.Sprintf("chain-%s", chainID)) - if err := os.RemoveAll(chainDir); err != nil { - return fmt.Errorf("supernode: wipe chain dir %s: %w", chainDir, err) - } - s.log.Info("wiped interop chain data dir", "chain", chainID, "path", chainDir) - } - } - - newIA := interop.New( - s.log.New("activity", "interop"), - *s.interopActivationTs, - s.interopMsgExpiryWindow, - s.chains, - s.cfg.DataDir, - s.l1Client, - s.cfg.InteropLogBackfillDepth, - s.supernodeMetrics, - ) - if newIA == nil { - return fmt.Errorf("supernode: failed to construct replacement interop activity") - } - - // Replace in s.activities so Reset-callback fan-out and test-only accessors - // find the new instance. Locked because onChainReset and InteropActivity - // iterate this slice from concurrent goroutines (chain containers are still - // running across the restart). - replaced := false - s.activitiesMu.Lock() - for i, a := range s.activities { - if a == old { - s.activities[i] = newIA - replaced = true - break - } - } - s.activitiesMu.Unlock() - if !replaced { - return fmt.Errorf("supernode: old interop activity not found in activities slice") - } - - // Swap verifier registration on every chain container. - for chainID, chain := range s.chains { - r, ok := chain.(verifierReplacer) - if !ok { - return fmt.Errorf("supernode: chain container for %s does not support ReplaceVerifier", chainID) - } - if !r.ReplaceVerifier(old, newIA) { - return fmt.Errorf("supernode: old interop activity not registered as verifier on chain %s", chainID) - } - } - - // Launch the replacement activity on the existing lifecycle context so - // it shares the supernode's shutdown path. Wait-group participation mirrors - // how activities are launched in Start(). 
- s.wg.Add(1) - go func() { - defer s.wg.Done() - err := newIA.Start(s.lifecycleCtx) - switch err { - case nil: - s.log.Error("activity quit unexpectedly", "name", newIA.Name()) - case context.Canceled: - s.log.Info("activity closing due to cancelled context", "name", newIA.Name()) - case context.DeadlineExceeded: - s.log.Warn("activity quit due to deadline exceeded", "name", newIA.Name()) - default: - s.log.Error("error running restarted interop activity", "name", newIA.Name(), "error", err) - } - }() - - s.log.Info("interop activity restarted", "wipedLogsDBs", wipeLogsDBs) - return nil -}