diff --git a/justfile b/justfile index 347286a7c81..3ef8a80878f 100644 --- a/justfile +++ b/justfile @@ -346,8 +346,8 @@ update-op-geth: # Build all Rust binaries (release) for sysgo tests. build-rust-release: cd rust && cargo build --release --bin kona-node --bin kona-host --bin op-reth - cd op-rbuilder && cargo build --release -p op-rbuilder --bin op-rbuilder - cd rollup-boost && cargo build --release -p rollup-boost --bin rollup-boost + cd rust/op-rbuilder && cargo build --release -p op-rbuilder --bin op-rbuilder + cd rust/rollup-boost && cargo build --release -p rollup-boost --bin rollup-boost # Checks that locked NUT bundles have not been modified. check-nut-locks: diff --git a/op-acceptance-tests/tests/supernode/interop/startup_resync/startup_resync_test.go b/op-acceptance-tests/tests/supernode/interop/startup_resync/startup_resync_test.go index 43398ad8016..0dfac7cbd08 100644 --- a/op-acceptance-tests/tests/supernode/interop/startup_resync/startup_resync_test.go +++ b/op-acceptance-tests/tests/supernode/interop/startup_resync/startup_resync_test.go @@ -1,7 +1,5 @@ // Package startup_resync contains acceptance tests for the op-supernode -// interop startup rework's cold-start resync path: stopping the supernode, -// deleting its on-disk data dir, and starting a fresh supernode against the -// same chain containers and virtual nodes. +// interop startup rework's cold-start resync path. package startup_resync import ( @@ -20,72 +18,92 @@ const ( preRestartFinalized = uint64(5) ) -// TestSupernodeResyncResumesAtActivation_PostActivation drives a full -// supernode data-dir wipe after the chain has crossed activation, and -// asserts that cross-safe keeps advancing post-restart and that the -// cold-start backfill restored history into the logs DB. +// TestSupernodeResyncResumesAtActivation_PostActivation wipes the verifier +// supernode's data dir after the chain has crossed activation and asserts +// cross-safe resumes on the verifier. The "EL data wiped" subtest +// additionally wipes the verifier ELs so they must execution-layer-sync +// from the chains' sequencer ELs. func TestSupernodeResyncResumesAtActivation_PostActivation(gt *testing.T) { + for _, tc := range []struct { + name string + restartOpts []func(*dsl.RestartOpts) + }{ + {"EL data intact", nil}, + {"EL data wiped", []func(*dsl.RestartOpts){dsl.WithELWiped}}, + } { + gt.Run(tc.name, func(gt *testing.T) { + runPostActivationResync(gt, tc.restartOpts) + }) + } +} + +// TestSupernodeResyncSchedulesAtActivation_PreActivation wipes the verifier +// supernode's data dir while interop is still scheduled and asserts +// cold-start parks the verifier at activation. The "EL data wiped" subtest +// additionally wipes the verifier ELs. +func TestSupernodeResyncSchedulesAtActivation_PreActivation(gt *testing.T) { + for _, tc := range []struct { + name string + restartOpts []func(*dsl.RestartOpts) + }{ + {"EL data intact", nil}, + {"EL data wiped", []func(*dsl.RestartOpts){dsl.WithELWiped}}, + } { + gt.Run(tc.name, func(gt *testing.T) { + runPreActivationResync(gt, tc.restartOpts) + }) + } +} + +func runPostActivationResync(gt *testing.T, restartOpts []func(*dsl.RestartOpts)) { t := devtest.SerialT(gt) - sys := presets.NewTwoL2SupernodeInterop(t, 0, + sys := presets.NewTwoL2SupernodeInteropPeerEL(t, 0, presets.WithUniformL2BlockTimes(l2BlockTime), presets.WithInteropLogBackfillDepth(backfillDepth), ) + sys.VerifierSupernode.AwaitBackfillCompleted() - sys.Supernode.AwaitBackfillCompleted() - - // Setup: let L2 finalized advance several blocks on both chains. On - // restart, op-node may drop back as part of its safe start process, - // but won't go past the finalized head. With finalized well past - // genesis the post-restart cold-start backfill has a real window to - // populate, instead of collapsing to empty against a re-recorded - // genesis SafeDB entry. dsl.CheckAll(t, - sys.L2ACL.AdvancedFn(types.Finalized, preRestartFinalized, 180), - sys.L2BCL.AdvancedFn(types.Finalized, preRestartFinalized, 180), + sys.VerifierL2ACL.AdvancedFn(types.Finalized, preRestartFinalized, 180), + sys.VerifierL2BCL.AdvancedFn(types.Finalized, preRestartFinalized, 180), ) - sys.Supernode.RestartWithFreshDataDir() - sys.Supernode.AwaitBackfillCompleted() + activation := sys.VerifierSupernode.ActivationTimestamp() + sys.VerifierSupernode.RestartWithFreshDataDir(restartOpts...) + sys.VerifierSupernode.AwaitVerificationStartsAtOrAfter(activation) + sys.VerifierSupernode.AwaitBackfillCompleted() dsl.CheckAll(t, - sys.L2ACL.AdvancedFn(types.CrossSafe, 1, 60), - sys.L2BCL.AdvancedFn(types.CrossSafe, 1, 60), + sys.VerifierL2ACL.AdvancedFn(types.CrossSafe, 1, 60), + sys.VerifierL2BCL.AdvancedFn(types.CrossSafe, 1, 60), ) - // Verify the cold-start backfill repopulated the logs DB. - sys.Supernode.AssertBackfillCovers(backfillDepth, l2BlockTime, + sys.VerifierSupernode.AssertBackfillCovers(backfillDepth, l2BlockTime, sys.L2A.ChainID(), sys.L2B.ChainID()) } -// TestSupernodeResyncSchedulesAtActivation_PreActivation drives a full -// supernode data-dir wipe while interop is scheduled but not yet active, -// and asserts that cold-start init parks the verifier at the (future) -// activation timestamp while cross-safe keeps advancing on both chains. -func TestSupernodeResyncSchedulesAtActivation_PreActivation(gt *testing.T) { +func runPreActivationResync(gt *testing.T, restartOpts []func(*dsl.RestartOpts)) { t := devtest.SerialT(gt) - // 60-minute delay: ensures the chain never approaches activation during - // the test, so we always exercise the genuine pre-activation cold-start - // path regardless of CI scheduling variance. - sys := presets.NewTwoL2SupernodeInterop(t, 60*60, + // Delay activation by an hour so the chain stays well below it throughout + // the test, and cold-start always parks at the future activation timestamp + // regardless of CI scheduling variance. + sys := presets.NewTwoL2SupernodeInteropPeerEL(t, uint64(60*60), presets.WithUniformL2BlockTimes(l2BlockTime), presets.WithInteropLogBackfillDepth(backfillDepth), ) + sys.VerifierSupernode.AwaitBackfillCompleted() + activation := sys.VerifierSupernode.ActivationTimestamp() - sys.Supernode.AwaitBackfillCompleted() - activation := sys.Supernode.ActivationTimestamp() - - // Setup: let local-safe accumulate enough that op-node's SafeDB has - // entries to serve to the post-restart cold-start init. dsl.CheckAll(t, - sys.L2ACL.AdvancedFn(types.LocalSafe, 2, 30), - sys.L2BCL.AdvancedFn(types.LocalSafe, 2, 30), + sys.VerifierL2ACL.AdvancedFn(types.LocalSafe, 2, 60), + sys.VerifierL2BCL.AdvancedFn(types.LocalSafe, 2, 60), ) - sys.Supernode.RestartWithFreshDataDir() - sys.Supernode.AwaitVerificationStartsAt(activation) + sys.VerifierSupernode.RestartWithFreshDataDir(restartOpts...) + sys.VerifierSupernode.AwaitVerificationStartsAt(activation) dsl.CheckAll(t, - sys.L2ACL.AdvancedFn(types.CrossSafe, 1, 60), - sys.L2BCL.AdvancedFn(types.CrossSafe, 1, 60), + sys.VerifierL2ACL.AdvancedFn(types.CrossSafe, 1, 60), + sys.VerifierL2BCL.AdvancedFn(types.CrossSafe, 1, 60), ) } diff --git a/op-devstack/dsl/l2_cl.go b/op-devstack/dsl/l2_cl.go index 5279510d8f0..167c2cc4d42 100644 --- a/op-devstack/dsl/l2_cl.go +++ b/op-devstack/dsl/l2_cl.go @@ -22,16 +22,14 @@ import ( // L2CLNode wraps a stack.L2CLNode interface for DSL operations type L2CLNode struct { commonImpl - inner stack.L2CLNode - managedPeers map[string]*L2CLNode + inner stack.L2CLNode } // NewL2CLNode creates a new L2CLNode DSL wrapper func NewL2CLNode(inner stack.L2CLNode) *L2CLNode { return &L2CLNode{ - commonImpl: commonFromT(inner.T()), - inner: inner, - managedPeers: make(map[string]*L2CLNode), + commonImpl: commonFromT(inner.T()), + inner: inner, } } @@ -56,7 +54,6 @@ func (cl *L2CLNode) Start() { lifecycle, ok := cl.inner.(stack.Lifecycle) cl.require.Truef(ok, "L2CL node %s is not lifecycle-controllable", cl.inner.Name()) lifecycle.Start() - cl.restoreManagedPeers() } func (cl *L2CLNode) Stop() { @@ -65,18 +62,6 @@ func (cl *L2CLNode) Stop() { lifecycle.Stop() } -func (cl *L2CLNode) ManagePeer(peer *L2CLNode) { - cl.managedPeers[peer.Name()] = peer - peer.managedPeers[cl.Name()] = cl -} - -func (cl *L2CLNode) restoreManagedPeers() { - for _, peer := range cl.managedPeers { - cl.connectPeerRaw(peer) - peer.connectPeerRaw(cl) - } -} - func (cl *L2CLNode) StartSequencer() { // The op-node Sequencer.Start RPC requires the caller to pass the hash of op-node's // current unsafe head. Reading the head and issuing the start call are two separate @@ -449,13 +434,9 @@ func (cl *L2CLNode) Peers() *apis.PeerDump { return peerDump } +// DisconnectPeer one-shot disconnects from peer. Restart-survivability is +// handled in sysgo via the peer registry. func (cl *L2CLNode) DisconnectPeer(peer *L2CLNode) { - delete(cl.managedPeers, peer.Name()) - delete(peer.managedPeers, cl.Name()) - cl.disconnectPeerRaw(peer) -} - -func (cl *L2CLNode) disconnectPeerRaw(peer *L2CLNode) { peerInfo := peer.PeerInfo() err := retry.Do0(cl.ctx, 3, retry.Exponential(), func() error { return cl.inner.P2PAPI().DisconnectPeer(cl.ctx, peerInfo.PeerID) @@ -463,16 +444,11 @@ func (cl *L2CLNode) disconnectPeerRaw(peer *L2CLNode) { cl.require.NoError(err, "failed to disconnect peer") } +// ConnectPeer one-shot dials peer. Restart-survivability is handled in sysgo +// via the peer registry. func (cl *L2CLNode) ConnectPeer(peer *L2CLNode) { - cl.managedPeers[peer.Name()] = peer - peer.managedPeers[cl.Name()] = cl - cl.connectPeerRaw(peer) -} - -func (cl *L2CLNode) connectPeerRaw(peer *L2CLNode) { peerInfo := peer.PeerInfo() cl.require.NotZero(len(peerInfo.Addresses), "failed to get peer address") - // graceful backoff for p2p connection, to avoid dial backoff or connection refused error strategy := &retry.ExponentialStrategy{Min: 10 * time.Second, Max: 30 * time.Second, MaxJitter: 250 * time.Millisecond} err := retry.Do0(cl.ctx, 5, strategy, func() error { return cl.inner.P2PAPI().ConnectPeer(cl.ctx, peerInfo.Addresses[0]) diff --git a/op-devstack/dsl/l2_el.go b/op-devstack/dsl/l2_el.go index 006eda3810b..1882dd12e9c 100644 --- a/op-devstack/dsl/l2_el.go +++ b/op-devstack/dsl/l2_el.go @@ -359,6 +359,20 @@ func (el *L2ELNode) Start() { lifecycle.Start() } +// WipeOnDiskState wipes any persistent state belonging to the EL between a +// Stop and Start. No-op for in-memory ELs. +func (el *L2ELNode) WipeOnDiskState() { + wiper, ok := el.inner.(stack.OnDiskStateWiper) + if !ok { + return + } + el.log.Info("Wiping on-disk state", "name", el.inner.Name()) + err := wiper.WipeOnDiskState() + el.require.NoErrorf(err, "failed to wipe on-disk state for %s", el.inner.Name()) +} + +// PeerWith one-shot-dials peer. Static-topology restart survival is handled +// in sysgo via the peer registry, so this is just a thin convenience. func (el *L2ELNode) PeerWith(peer *L2ELNode) { sysgo.ConnectP2P(el.ctx, el.require, el.inner.L2EthClient().RPC(), peer.inner.L2EthClient().RPC(), false) } diff --git a/op-devstack/dsl/supernode.go b/op-devstack/dsl/supernode.go index b7354ce2717..775957097ea 100644 --- a/op-devstack/dsl/supernode.go +++ b/op-devstack/dsl/supernode.go @@ -17,6 +17,15 @@ type Supernode struct { commonImpl inner stack.Supernode testControl stack.SupernodeTestControl + frontedELs []*L2ELNode +} + +// AttachELs records the L2 ELs this supernode drives. Used by +// RestartWithFreshDataDir(WithELWiped) to wipe their on-disk state +// alongside the supernode's, and available to future operations that +// need to address the supernode's ELs as a group. +func (s *Supernode) AttachELs(els []*L2ELNode) { + s.frontedELs = els } // NewSupernode creates a new Supernode DSL wrapper @@ -129,16 +138,48 @@ func (s *Supernode) ResumeInterop() { s.interopActivity().Resume() } +// RestartOpts controls optional behaviour of Supernode.RestartWithFreshDataDir. +type RestartOpts struct { + // WipeELs, when true, additionally stops and wipes every supernode- + // fronted EL alongside the supernode data dir, forcing post-restart + // execution-layer sync from peer ELs. + WipeELs bool +} + +// WithELWiped is a RestartWithFreshDataDir option that also wipes every +// supernode-fronted EL's on-disk state. Requires AttachELs. +func WithELWiped(o *RestartOpts) { o.WipeELs = true } + // RestartWithFreshDataDir stops the supernode, deletes its on-disk data // directory in full, and starts a fresh supernode against the same chain -// containers, virtual nodes, and externally-visible RPC address. -// Requires NewSupernodeWithTestControl. -func (s *Supernode) RestartWithFreshDataDir() { +// containers, virtual nodes, and externally-visible RPC address. With +// WithELWiped, every fronted EL is stopped, wiped, and restarted between +// the supernode stop and start so the post-restart VN must +// execution-layer-sync from peer ELs. Each EL's Start re-dials its +// registered static peers; fronted CL static peers are re-dialed after the +// supernode comes back up. Requires NewSupernodeWithTestControl, plus +// AttachELs when WipeELs is set. +func (s *Supernode) RestartWithFreshDataDir(opts ...func(*RestartOpts)) { s.require.NotNil(s.testControl, "RestartWithFreshDataDir requires test control; use NewSupernodeWithTestControl") - s.log.Info("restarting supernode with fresh data dir") - err := s.testControl.RestartWithFreshDataDir() - s.require.NoError(err, "failed to restart supernode with fresh data dir") + + o := RestartOpts{} + for _, fn := range opts { + fn(&o) + } + + s.log.Info("restarting supernode with fresh data dir", "wipe_els", o.WipeELs) + s.testControl.Stop() + if o.WipeELs { + for _, el := range s.frontedELs { + el.Stop() + el.WipeOnDiskState() + } + for _, el := range s.frontedELs { + el.Start() + } + } + s.require.NoError(s.testControl.StartWithFreshDataDir(), "start supernode fresh") } // BackfillAttempts returns the number of log-backfill attempts since the @@ -188,6 +229,23 @@ func (s *Supernode) VerificationStartTimestamp() uint64 { return s.interopActivity().VerificationStartTimestamp() } +// AwaitVerificationStartsAtOrAfter is the >= variant of +// AwaitVerificationStartsAt, for cases where the exact handoff timestamp +// depends on which safeDB entry is recorded first after a cold start. +func (s *Supernode) AwaitVerificationStartsAtOrAfter(minExpected uint64) { + ia := s.interopActivity() + ctx, cancel := context.WithTimeout(s.ctx, 3*DefaultTimeout) + defer cancel() + err := wait.For(ctx, 500*time.Millisecond, func() (bool, error) { + return ia.BackfillCompleted(), nil + }) + s.require.NoError(err, "cold-start initialization did not complete in time") + actual := ia.VerificationStartTimestamp() + s.require.GreaterOrEqualf(actual, minExpected, + "verificationStartTimestamp must be >= %d after cold-start init, got %d", + minExpected, actual) +} + // AwaitVerificationStartsAt blocks until cold-start init completes, then // asserts VerificationStartTimestamp equals expected. // Requires NewSupernodeWithTestControl. diff --git a/op-devstack/presets/singlechain_from_runtime.go b/op-devstack/presets/singlechain_from_runtime.go index 9731df1b558..3b488f43302 100644 --- a/op-devstack/presets/singlechain_from_runtime.go +++ b/op-devstack/presets/singlechain_from_runtime.go @@ -84,9 +84,6 @@ func singleChainMultiNodeFromRuntime(t devtest.T, runtime *sysgo.SingleChainRunt L2ELB: dsl.NewL2ELNode(l2ELB), L2CLB: dsl.NewL2CLNode(l2CLB), } - if runtime.P2PEnabled { - preset.L2CLB.ManagePeer(preset.L2CL) - } if runSyncChecks { // Ensure the follower node is in sync with the sequencer before starting tests. // CrossSafe requires derivation to run, which under ELSync can only begin @@ -210,8 +207,6 @@ func singleChainTwoVerifiersFromRuntime(t devtest.T, runtime *sysgo.SingleChainR L2CLC: dsl.NewL2CLNode(l2CLC), TestSequencer: dsl.NewTestSequencer(testSequencer), } - preset.L2CLC.ManagePeer(preset.L2CL) - preset.L2CLC.ManagePeer(preset.L2CLB) return preset } @@ -254,7 +249,6 @@ func simpleWithSyncTesterFromRuntime(t devtest.T, runtime *sysgo.SingleChainRunt SyncTesterL2EL: dsl.NewL2ELNode(syncTesterL2EL), L2CL2: dsl.NewL2CLNode(l2CL2), } - preset.L2CL2.ManagePeer(preset.L2CL) return preset } diff --git a/op-devstack/presets/twol2_from_runtime.go b/op-devstack/presets/twol2_from_runtime.go index 68865b2ee3d..a6c9bb6c7af 100644 --- a/op-devstack/presets/twol2_from_runtime.go +++ b/op-devstack/presets/twol2_from_runtime.go @@ -148,6 +148,7 @@ func twoL2SupernodeInteropFromRuntime(t devtest.T, runtime *sysgo.MultiChainRunt } preset.FunderA = dsl.NewFunder(preset.Wallet, preset.FaucetA, preset.L2ELA) preset.FunderB = dsl.NewFunder(preset.Wallet, preset.FaucetB, preset.L2ELB) + preset.Supernode.AttachELs([]*dsl.L2ELNode{preset.L2ELA, preset.L2ELB}) return preset } diff --git a/op-devstack/presets/twol2_peer_el.go b/op-devstack/presets/twol2_peer_el.go new file mode 100644 index 00000000000..9ef77377164 --- /dev/null +++ b/op-devstack/presets/twol2_peer_el.go @@ -0,0 +1,28 @@ +package presets + +import ( + "github.com/ethereum-optimism/optimism/op-devstack/devtest" + "github.com/ethereum-optimism/optimism/op-devstack/dsl" + "github.com/ethereum-optimism/optimism/op-devstack/sysgo" +) + +// TwoL2SupernodeInteropPeerEL extends TwoL2SupernodeInterop with a second +// supernode that follows both chains as a verifier. The embedded fields +// (Supernode, L2ELA, L2ELB, …) describe the sequencer side and continue +// to drive block production. VerifierSupernode + VerifierL2ELA/B are the +// wipe targets: the verifier VNs run in NonSequencer/ELSync mode and +// resync over EL devp2p from the sequencer ELs after a wipe. +type TwoL2SupernodeInteropPeerEL struct { + TwoL2SupernodeInterop + + VerifierSupernode *dsl.Supernode + VerifierL2ELA *dsl.L2ELNode + VerifierL2ELB *dsl.L2ELNode + VerifierL2ACL *dsl.L2CLNode + VerifierL2BCL *dsl.L2CLNode +} + +func NewTwoL2SupernodeInteropPeerEL(t devtest.T, delaySeconds uint64, opts ...Option) *TwoL2SupernodeInteropPeerEL { + presetCfg, _ := collectSupportedPresetConfig(t, "NewTwoL2SupernodeInteropPeerEL", opts, twoL2SupernodeInteropPresetSupportedOptionKinds) + return twoL2SupernodeInteropPeerELFromRuntime(t, sysgo.NewTwoL2SupernodeInteropPeerELRuntimeWithConfig(t, delaySeconds, presetCfg)) +} diff --git a/op-devstack/presets/twol2_peer_el_from_runtime.go b/op-devstack/presets/twol2_peer_el_from_runtime.go new file mode 100644 index 00000000000..37ddd995038 --- /dev/null +++ b/op-devstack/presets/twol2_peer_el_from_runtime.go @@ -0,0 +1,52 @@ +package presets + +import ( + "github.com/ethereum-optimism/optimism/op-devstack/devtest" + "github.com/ethereum-optimism/optimism/op-devstack/dsl" + "github.com/ethereum-optimism/optimism/op-devstack/sysgo" +) + +func twoL2SupernodeInteropPeerELFromRuntime(t devtest.T, runtime *sysgo.MultiChainRuntime) *TwoL2SupernodeInteropPeerEL { + base := twoL2SupernodeInteropFromRuntime(t, runtime) + + t.Require().NotNil(runtime.VerifierSupernode, "verifier supernode missing from runtime") + chainA := runtime.Chains["l2a"] + chainB := runtime.Chains["l2b"] + t.Require().NotNil(chainA.VerifierEL, "verifier EL missing on l2a") + t.Require().NotNil(chainB.VerifierEL, "verifier EL missing on l2b") + + verifierA := addVerifierNode(t, base.L2A, chainA) + verifierB := addVerifierNode(t, base.L2B, chainB) + + verifierSupernode := newSupernodeFrontend(t, "supernode-two-l2-verifier", runtime.VerifierSupernode.UserRPC()) + verifier := dsl.NewSupernodeWithTestControl(verifierSupernode, runtime.VerifierSupernode) + verifier.AttachELs([]*dsl.L2ELNode{verifierA.el, verifierB.el}) + + return &TwoL2SupernodeInteropPeerEL{ + TwoL2SupernodeInterop: *base, + VerifierSupernode: verifier, + VerifierL2ELA: verifierA.el, + VerifierL2ELB: verifierB.el, + VerifierL2ACL: verifierA.cl, + VerifierL2BCL: verifierB.cl, + } +} + +type verifierNode struct { + el *dsl.L2ELNode + cl *dsl.L2CLNode +} + +func addVerifierNode(t devtest.T, l2Net *dsl.L2Network, chain *sysgo.MultiChainNodeRuntime) verifierNode { + chainID := chain.Network.ChainID() + el := newL2ELFrontend(t, "verifier", chainID, chain.VerifierEL.UserRPC(), chain.VerifierEL.EngineRPC(), chain.VerifierEL.JWTPath(), chain.Network.RollupConfig(), chain.VerifierEL) + cl := newL2CLFrontend(t, "verifier", chainID, chain.VerifierCL.UserRPC(), chain.VerifierCL) + cl.attachEL(el) + + net, ok := l2Net.Escape().(*presetL2Network) + t.Require().True(ok, "expected preset L2 network") + net.AddL2ELNode(el) + net.AddL2CLNode(cl) + + return verifierNode{el: dsl.NewL2ELNode(el), cl: dsl.NewL2CLNode(cl)} +} diff --git a/op-devstack/stack/lifecycle.go b/op-devstack/stack/lifecycle.go index 3aee741c34d..3a6c16519f0 100644 --- a/op-devstack/stack/lifecycle.go +++ b/op-devstack/stack/lifecycle.go @@ -4,3 +4,10 @@ type Lifecycle interface { Start() Stop() } + +// OnDiskStateWiper is an optional interface implemented by components that +// persist state on disk. Test helpers call WipeOnDiskState between Stop and +// Start to force a genuine cold-start restart. +type OnDiskStateWiper interface { + WipeOnDiskState() error +} diff --git a/op-devstack/stack/supernode.go b/op-devstack/stack/supernode.go index 15c7da235ac..9b0b8bc2d34 100644 --- a/op-devstack/stack/supernode.go +++ b/op-devstack/stack/supernode.go @@ -16,11 +16,15 @@ type Supernode interface { type SupernodeTestControl interface { // InteropActivity returns the current interop activity, or nil if the // supernode is stopped or interop is not configured. Do not cache the - // pointer across RestartWithFreshDataDir. + // pointer across StartWithFreshDataDir. InteropActivity() *interop.Interop - // RestartWithFreshDataDir stops the supernode, deletes its on-disk - // data directory, and starts a fresh supernode against the same chain - // containers, virtual nodes, and externally-visible RPC address. - RestartWithFreshDataDir() error + // Stop stops the supernode without touching its data dir, leaving the + // externally-visible RPC address in place so peer components can be + // wiped between Stop and StartWithFreshDataDir. + Stop() + + // StartWithFreshDataDir wipes the supernode's data dir and starts it + // against the same VNs and RPC address. Pairs with Stop. + StartWithFreshDataDir() error } diff --git a/op-devstack/sysgo/l2_cl_supernode.go b/op-devstack/sysgo/l2_cl_supernode.go index 797a4607d70..434ac53430f 100644 --- a/op-devstack/sysgo/l2_cl_supernode.go +++ b/op-devstack/sysgo/l2_cl_supernode.go @@ -19,10 +19,13 @@ import ( "github.com/ethereum-optimism/optimism/op-supernode/supernode/activity/interop" ) -var errSupernodeNotRunning = errors.New("sysgo: supernode is not running") - type SuperNode struct { - mu sync.Mutex + mu sync.Mutex + peerRegistry + // Per-chain RPC routes to wait on after (re)start before considering the + // supernode ready for peer-connector replay. Populated when a + // SuperNodeProxy is attached. + routes []string sn *supernode.Supernode cancel context.CancelFunc httpProxy *tcpproxy.Proxy @@ -91,6 +94,13 @@ func (n *SuperNode) startLocked() { addr, err := n.sn.WaitRPCAddr(ctx) n.p.Require().NoError(err, "supernode failed to bind RPC address") n.httpProxy.SetUpstream(ProxyAddr(n.p.Require(), "http://"+addr)) + + for _, route := range n.routes { + waitForSupernodeRoute(n.p, n.logger, route) + } + for _, connect := range n.snapshotConnectors() { + connect() + } } func (n *SuperNode) Stop() { @@ -118,7 +128,7 @@ func (n *SuperNode) stopLocked() { // InteropActivity returns the interop activity, or nil if the supernode is // stopped or has no interop activity. The pointer is bound to the current -// instance; do not cache across RestartWithFreshDataDir. Test-only. +// instance; do not cache across StartWithFreshDataDir. Test-only. func (n *SuperNode) InteropActivity() *interop.Interop { n.mu.Lock() defer n.mu.Unlock() @@ -128,20 +138,23 @@ func (n *SuperNode) InteropActivity() *interop.Interop { return n.sn.InteropActivity() } -// RestartWithFreshDataDir stops the supernode, deletes its on-disk data -// directory, and starts a fresh supernode against the same chain -// containers, virtual nodes, and externally-visible RPC address. Test-only. -func (n *SuperNode) RestartWithFreshDataDir() error { +// StartWithFreshDataDir wipes the data dir and starts a fresh supernode. +// Pairs with Stop. Test-only. +func (n *SuperNode) StartWithFreshDataDir() error { n.mu.Lock() defer n.mu.Unlock() - if n.sn == nil { - return errSupernodeNotRunning + if n.sn != nil { + return errors.New("sysgo: StartWithFreshDataDir called while supernode is still running") } + return n.wipeDataDirAndStartLocked() +} + +// wipeDataDirAndStartLocked must be called with n.mu held and n.sn nil. +func (n *SuperNode) wipeDataDirAndStartLocked() error { if n.snCfg == nil || n.snCfg.DataDir == "" { - return errors.New("sysgo: RestartWithFreshDataDir requires a configured supernode DataDir") + return errors.New("sysgo: fresh data dir restart requires a configured supernode DataDir") } - n.logger.Info("restarting supernode with fresh data dir", "data_dir", n.snCfg.DataDir) - n.stopLocked() + n.logger.Info("wiping supernode data dir", "data_dir", n.snCfg.DataDir) if err := os.RemoveAll(n.snCfg.DataDir); err != nil { return fmt.Errorf("sysgo: wipe supernode data dir %s: %w", n.snCfg.DataDir, err) } @@ -159,9 +172,15 @@ type SuperNodeProxy struct { userRPC string interopEndpoint string interopJwtSecret eth.Bytes32 + + // superNode is the underlying supernode that owns this proxy's RPC route. + // Peer connectors registered on the proxy are forwarded so they replay + // when the supernode restarts. + superNode *SuperNode } var _ L2CLNode = (*SuperNodeProxy)(nil) +var _ PeerRegistrar = (*SuperNodeProxy)(nil) func (n *SuperNodeProxy) Start() {} func (n *SuperNodeProxy) Stop() {} @@ -170,6 +189,25 @@ func (n *SuperNodeProxy) InteropRPC() (endpoint string, jwtSecret eth.Bytes32) { return n.interopEndpoint, n.interopJwtSecret } +func (n *SuperNodeProxy) RegisterPeerConnector(connect func()) { + n.superNode.attachRoute(n.userRPC) + n.superNode.RegisterPeerConnector(connect) +} + +// attachRoute records a per-chain RPC route that startLocked must wait on +// after (re)start, so a peer-connector replay never fires before the +// supernode is actually serving the chain. +func (n *SuperNode) attachRoute(rpcEndpoint string) { + n.mu.Lock() + defer n.mu.Unlock() + for _, existing := range n.routes { + if existing == rpcEndpoint { + return + } + } + n.routes = append(n.routes, rpcEndpoint) +} + // SupernodeConfig holds configuration options for the shared supernode. type SupernodeConfig struct { // InteropActivationTimestamp enables the interop activity at the given timestamp. diff --git a/op-devstack/sysgo/l2_el_opgeth.go b/op-devstack/sysgo/l2_el_opgeth.go index 44702840950..400f6e1ba18 100644 --- a/op-devstack/sysgo/l2_el_opgeth.go +++ b/op-devstack/sysgo/l2_el_opgeth.go @@ -19,6 +19,7 @@ import ( type OpGeth struct { mu sync.Mutex + peerRegistry p devtest.CommonT logger log.Logger @@ -128,6 +129,10 @@ func (n *OpGeth) Start() { n.l2Geth = l2Geth n.authProxy.SetUpstream(ProxyAddr(require, l2Geth.AuthRPC().RPC())) n.userProxy.SetUpstream(ProxyAddr(require, l2Geth.UserRPC().RPC())) + + for _, connect := range n.snapshotConnectors() { + connect() + } } func (n *OpGeth) Stop() { diff --git a/op-devstack/sysgo/l2_el_opreth.go b/op-devstack/sysgo/l2_el_opreth.go index 06663a20ec1..d5a8ccff2b4 100644 --- a/op-devstack/sysgo/l2_el_opreth.go +++ b/op-devstack/sysgo/l2_el_opreth.go @@ -3,6 +3,8 @@ package sysgo import ( "fmt" "net/url" + "os" + "os/exec" "strings" "sync" @@ -78,6 +80,7 @@ func OpRethWithSupervisorURL(supervisorURL string) OpRethOption { type OpReth struct { mu sync.Mutex + peerRegistry name string chainID eth.ChainID @@ -94,6 +97,12 @@ type OpReth struct { // Each entry is of the form "key=value". env []string + // On-disk state — tracked so tests can wipe and re-init before restart. + dataDirPath string + chainConfigPath string + proofHistoryDir string + proofStorageVer string + p devtest.T sub *SubProcess @@ -186,18 +195,70 @@ func (n *OpReth) Start() { n.userProxy.SetUpstream(ProxyAddr(n.p.Require(), userRPCAddr)) n.authProxy.SetUpstream(ProxyAddr(n.p.Require(), authRPCAddr)) + + for _, connect := range n.snapshotConnectors() { + connect() + } } -// Stop stops the op-reth node. -// warning: no restarts supported yet, since the RPC port is not remembered. +// Stop stops the op-reth node. The user/auth RPC proxy addresses survive so +// Start may be called again to bring the process back up. func (n *OpReth) Stop() { n.mu.Lock() defer n.mu.Unlock() + if n.sub == nil { + return + } err := n.sub.Stop(true) n.p.Require().NoError(err, "Must stop") n.sub = nil } +// initStorage runs op-reth's `init` and (when configured) `proofs init` +// against the node's data dirs. Used at first start and after WipeOnDiskState. +func (n *OpReth) initStorage() error { + if out, err := exec.Command(n.execPath, "init", "--datadir="+n.dataDirPath, "--chain="+n.chainConfigPath).CombinedOutput(); err != nil { + return fmt.Errorf("op-reth %s: init: %w: %s", n.name, err, string(out)) + } + if n.proofHistoryDir != "" && n.proofStorageVer != "" { + out, err := exec.Command(n.execPath, "proofs", "init", + "--datadir="+n.dataDirPath, + "--chain="+n.chainConfigPath, + "--proofs-history.storage-path="+n.proofHistoryDir, + "--proofs-history.storage-version="+n.proofStorageVer, + ).CombinedOutput() + if err != nil { + return fmt.Errorf("op-reth %s: proofs init: %w: %s", n.name, err, string(out)) + } + } + return nil +} + +// WipeOnDiskState removes and re-initialises the op-reth data dir and +// proof-history dir. Callers must Stop the node first. +func (n *OpReth) WipeOnDiskState() error { + n.mu.Lock() + defer n.mu.Unlock() + if n.sub != nil { + return fmt.Errorf("op-reth %s: cannot wipe while running", n.name) + } + if n.dataDirPath == "" || n.chainConfigPath == "" { + return fmt.Errorf("op-reth %s: data dir not tracked", n.name) + } + if err := os.RemoveAll(n.dataDirPath); err != nil { + return fmt.Errorf("op-reth %s: remove datadir: %w", n.name, err) + } + if err := os.MkdirAll(n.dataDirPath, 0o755); err != nil { + return fmt.Errorf("op-reth %s: recreate datadir: %w", n.name, err) + } + if n.proofHistoryDir != "" { + if err := os.RemoveAll(n.proofHistoryDir); err != nil { + return fmt.Errorf("op-reth %s: remove proof history: %w", n.name, err) + } + } + return n.initStorage() +} + func (n *OpReth) UserRPC() string { return n.userRPC } diff --git a/op-devstack/sysgo/mixed_runtime.go b/op-devstack/sysgo/mixed_runtime.go index b1ceea5bdeb..2ccb358fe85 100644 --- a/op-devstack/sysgo/mixed_runtime.go +++ b/op-devstack/sysgo/mixed_runtime.go @@ -5,7 +5,6 @@ import ( "encoding/hex" "encoding/json" "os" - "os/exec" "path/filepath" "strings" @@ -346,27 +345,8 @@ func buildMixedOpRethNode( args = append(args, "--metrics=127.0.0.1:0") } - initArgs := []string{ - "init", - "--datadir=" + dataDirPath, - "--chain=" + chainConfigPath, - } - err = exec.Command(execPath, initArgs...).Run() - t.Require().NoError(err, "must init op-reth node") - proofHistoryDir := filepath.Join(tempDir, "proof-history") - initProofsArgs := []string{ - "proofs", - "init", - "--datadir=" + dataDirPath, - "--chain=" + chainConfigPath, - "--proofs-history.storage-path=" + proofHistoryDir, - "--proofs-history.storage-version=" + storageVersion, - } - initOut, initErr := exec.Command(execPath, initProofsArgs...).CombinedOutput() - t.Require().NoError(initErr, "must init op-reth proof history: %s", string(initOut)) - args = append( args, "--proofs-history", @@ -379,7 +359,7 @@ func buildMixedOpRethNode( OpRethOptionBundle(opts).Apply(t, NewComponentTarget(key, l2Net.ChainID()), opRethCfg) args = append(args, opRethCfg.ExtraArgs...) - return &OpReth{ + node := &OpReth{ name: key, chainID: l2Net.ChainID(), jwtPath: jwtPath, @@ -389,9 +369,15 @@ func buildMixedOpRethNode( execPath: execPath, args: args, env: []string{}, + dataDirPath: dataDirPath, + chainConfigPath: chainConfigPath, + proofHistoryDir: proofHistoryDir, + proofStorageVer: storageVersion, p: t, l2MetricsRegistrar: metricsRegistrar, } + t.Require().NoError(node.initStorage(), "must init op-reth storage") + return node } func startMixedOpRethNode( diff --git a/op-devstack/sysgo/multichain_supernode_runtime.go b/op-devstack/sysgo/multichain_supernode_runtime.go index dfbfefdd538..37c8d25651b 100644 --- a/op-devstack/sysgo/multichain_supernode_runtime.go +++ b/op-devstack/sysgo/multichain_supernode_runtime.go @@ -90,6 +90,17 @@ func NewTwoL2SupernodeFollowL2RuntimeWithConfig(t devtest.T, delaySeconds uint64 return runtime } +// NewTwoL2SupernodeInteropPeerELRuntimeWithConfig builds the base interop +// runtime (sequencer supernode driving both chains) and adds a second +// supernode in NonSequencer/ELSync mode that follows both chains as a +// verifier. The verifier supernode and its ELs can be wiped without +// halting block production on the sequencer supernode. +func NewTwoL2SupernodeInteropPeerELRuntimeWithConfig(t devtest.T, delaySeconds uint64, cfg PresetConfig) *MultiChainRuntime { + runtime := NewTwoL2SupernodeInteropRuntimeWithConfig(t, delaySeconds, cfg) + addVerifierSupernode(t, runtime, cfg) + return runtime +} + func newTwoL2SupernodeRuntime(t devtest.T, enableInterop bool, delaySeconds uint64) (*MultiChainRuntime, uint64) { return newTwoL2SupernodeRuntimeWithConfig(t, enableInterop, delaySeconds, PresetConfig{}) } @@ -279,15 +290,17 @@ func newTwoL2SupernodeRuntimeWithConfig(t devtest.T, enableInterop bool, delaySe require.NoError(err, "failed to override message expiry window") } - supernode, l2ACL, l2BCL := startTwoL2SharedSupernode( + supernode, l2ACL, l2BCL := startTwoL2SharedSupernodeWithMode( t, l1Net, l1EL, l1CL, l2ANet, l2AEL, + supernodeVNMode{}, l2BNet, l2BEL, + supernodeVNMode{}, depSet, interopActivationTimestamp, cfg.InteropLogBackfillDepth, @@ -458,15 +471,81 @@ func addMultiChainFollowL2Node(t devtest.T, runtime *MultiChainRuntime, chainKey return node } -func startTwoL2SharedSupernode( +// addVerifierSupernode starts a second supernode that follows both chains +// as a NonSequencer ELSync verifier, with its own EL per chain. Each +// verifier EL peers bidi over EL devp2p with the chain's sequencer EL so +// payloads flow into the verifier without engaging the verifier's CL +// reqresp. The two supernodes' per-chain VNs are CL-peered so the verifier +// hears head announcements and can drive snap sync. +func addVerifierSupernode(t devtest.T, runtime *MultiChainRuntime, cfg PresetConfig) { + chainA := runtime.Chains["l2a"] + chainB := runtime.Chains["l2b"] + t.Require().NotNil(chainA, "missing l2a runtime chain") + t.Require().NotNil(chainB, "missing l2b runtime chain") + t.Require().NotNil(runtime.Supernode, "verifier supernode requires a sequencer supernode") + + depSetStatic, ok := runtime.DependencySet.(*depset.StaticConfigDependencySet) + t.Require().True(ok, "verifier supernode requires static dependency set") + + jwtPath := chainA.EL.JWTPath() + jwtSecret := readJWTSecretFromPath(t, jwtPath) + + verAEL := startL2ELForKey(t, chainA.Network, jwtPath, jwtSecret, "verifier", NewELNodeIdentity(0)) + verBEL := startL2ELForKey(t, chainB.Network, jwtPath, jwtSecret, "verifier", NewELNodeIdentity(0)) + + verifierMode := supernodeVNMode{ + NonSequencer: true, + SyncMode: nodeSync.ELSync, + DisableReqRespSync: true, + NoDiscovery: true, + } + + verifierSN, verACL, verBCL := startTwoL2SharedSupernodeWithMode( + t, + runtime.L1Network, + runtime.L1EL, + runtime.L1CL, + chainA.Network, verAEL, verifierMode, + chainB.Network, verBEL, verifierMode, + depSetStatic, + runtime.Supernode.snCfg.InteropActivationTimestamp, + cfg.InteropLogBackfillDepth, + jwtSecret, + ) + + connectL2ELPeersBidi(t, t.Logger(), verAEL, chainA.EL, true) + connectL2ELPeersBidi(t, t.Logger(), verBEL, chainB.EL, true) + connectL2CLPeers(t, t.Logger(), verACL, chainA.CL) + connectL2CLPeers(t, t.Logger(), verBCL, chainB.CL) + + runtime.VerifierSupernode = verifierSN + chainA.VerifierEL = verAEL + chainA.VerifierCL = verACL + chainB.VerifierEL = verBEL + chainB.VerifierCL = verBCL +} + +// supernodeVNMode overrides the per-VN sequencing/sync configuration. The +// zero value matches the default in-process VN (sequencer, CL-sync, reqresp +// on, discovery on). +type supernodeVNMode struct { + NonSequencer bool + SyncMode nodeSync.Mode + DisableReqRespSync bool + NoDiscovery bool +} + +func startTwoL2SharedSupernodeWithMode( t devtest.T, l1Net *L1Network, l1EL *L1Geth, l1CL *L1CLNode, l2ANet *L2Network, l2AEL L2ELNode, + l2AMode supernodeVNMode, l2BNet *L2Network, l2BEL L2ELNode, + l2BMode supernodeVNMode, depSet *depset.StaticConfigDependencySet, interopActivationTimestamp *uint64, interopLogBackfillDepth time.Duration, @@ -474,17 +553,31 @@ func startTwoL2SharedSupernode( ) (*SuperNode, *SuperNodeProxy, *SuperNodeProxy) { require := t.Require() logger := t.Logger().New("component", "supernode") - makeNodeCfg := func(l2Net *L2Network, l2EL L2ELNode) *opnodeconfig.Config { - p2pKey, err := l2Net.keys.Secret(devkeys.SequencerP2PRole.Key(l2Net.ChainID().ToBig())) - require.NoError(err, "need p2p key for supernode virtual sequencer") + makeNodeCfg := func(l2Net *L2Network, l2EL L2ELNode, mode supernodeVNMode) *opnodeconfig.Config { + sequencerEnabled := !mode.NonSequencer + enableReqRespSync := !mode.DisableReqRespSync + sequencerKeyHex := "" + if sequencerEnabled { + p2pKey, err := l2Net.keys.Secret(devkeys.SequencerP2PRole.Key(l2Net.ChainID().ToBig())) + require.NoError(err, "need p2p key for supernode virtual sequencer") + sequencerKeyHex = hex.EncodeToString(crypto.FromECDSA(p2pKey)) + } p2pConfig, p2pSignerSetup := newDevstackP2PConfig( t, logger.New("chain_id", l2Net.ChainID().String(), "component", "supernode-p2p"), l2Net.rollupCfg.BlockTime, - false, - true, - hex.EncodeToString(crypto.FromECDSA(p2pKey)), + mode.NoDiscovery, + enableReqRespSync, + sequencerKeyHex, ) + syncMode := mode.SyncMode + if syncMode == 0 { + syncMode = nodeSync.CLSync + } + syncCfg := nodeSync.Config{ + SyncMode: syncMode, + SyncModeReqResp: enableReqRespSync, + } cfg := &opnodeconfig.Config{ L1: &opnodeconfig.L1EndpointConfig{ L1NodeAddr: l1EL.UserRPC(), @@ -503,7 +596,7 @@ func startTwoL2SharedSupernode( }, DependencySet: depSet, Beacon: &opnodeconfig.L1BeaconEndpointConfig{BeaconAddr: l1CL.beaconHTTPAddr}, - Driver: driver.Config{SequencerEnabled: true, SequencerConfDepth: 2}, + Driver: driver.Config{SequencerEnabled: sequencerEnabled, SequencerConfDepth: 2}, Rollup: *l2Net.rollupCfg, P2PSigner: p2pSignerSetup, RPC: oprpc.CLIConfig{ListenAddr: "127.0.0.1", ListenPort: 0, EnableAdmin: true}, @@ -511,7 +604,7 @@ func startTwoL2SharedSupernode( P2P: p2pConfig, L1EpochPollInterval: 2 * time.Second, RuntimeConfigReloadInterval: 0, - Sync: nodeSync.Config{SyncMode: nodeSync.CLSync, SyncModeReqResp: true}, + Sync: syncCfg, ConfigPersistence: opnodeconfig.DisabledConfigPersistence{}, Metrics: opmetrics.CLIConfig{}, Pprof: oppprof.CLIConfig{}, @@ -523,8 +616,8 @@ func startTwoL2SharedSupernode( } vnCfgs := map[eth.ChainID]*opnodeconfig.Config{ - l2ANet.ChainID(): makeNodeCfg(l2ANet, l2AEL), - l2BNet.ChainID(): makeNodeCfg(l2BNet, l2BEL), + l2ANet.ChainID(): makeNodeCfg(l2ANet, l2AEL, l2AMode), + l2BNet.ChainID(): makeNodeCfg(l2BNet, l2BEL, l2BMode), } chainIDs := []uint64{eth.EvilChainIDToUInt64(l2ANet.ChainID()), eth.EvilChainIDToUInt64(l2BNet.ChainID())} @@ -567,6 +660,7 @@ func startTwoL2SharedSupernode( userRPC: l2ARPC, interopEndpoint: l2ARPC, interopJwtSecret: jwtSecret, + superNode: supernode, } l2BCL := &SuperNodeProxy{ p: t, @@ -574,6 +668,7 @@ func startTwoL2SharedSupernode( userRPC: l2BRPC, interopEndpoint: l2BRPC, interopJwtSecret: jwtSecret, + superNode: supernode, } return supernode, l2ACL, l2BCL @@ -681,10 +776,16 @@ func startSingleChainSharedSupernode( userRPC: l2RPC, interopEndpoint: l2RPC, interopJwtSecret: jwtSecret, + superNode: supernode, } } -func waitForSupernodeRoute(t devtest.T, logger log.Logger, rpcEndpoint string) { +// waitForSupernodeRoute polls a per-chain supernode route until it answers +// both optimism_rollupConfig (the route is registered) and opp2p_self with +// at least one listener address (the in-process VN's p2p host is fully up). +// Returning before opp2p_self stabilises produces stale-address peer dials +// after a restart. +func waitForSupernodeRoute(t devtest.CommonT, logger log.Logger, rpcEndpoint string) { deadline := time.Now().Add(15 * time.Second) for { if time.Now().After(deadline) { @@ -693,10 +794,14 @@ func waitForSupernodeRoute(t devtest.T, logger log.Logger, rpcEndpoint string) { rpcCl, err := client.NewRPC(t.Ctx(), logger, rpcEndpoint, client.WithLazyDial()) if err == nil { - var out any - callErr := rpcCl.CallContext(t.Ctx(), &out, "optimism_rollupConfig") + var rollupOut any + rollupErr := rpcCl.CallContext(t.Ctx(), &rollupOut, "optimism_rollupConfig") + var self struct { + Addresses []string `json:"addresses"` + } + selfErr := rpcCl.CallContext(t.Ctx(), &self, "opp2p_self") rpcCl.Close() - if callErr == nil { + if rollupErr == nil && selfErr == nil && len(self.Addresses) > 0 { return } } diff --git a/op-devstack/sysgo/peer_registry.go b/op-devstack/sysgo/peer_registry.go new file mode 100644 index 00000000000..7875a1a8f52 --- /dev/null +++ b/op-devstack/sysgo/peer_registry.go @@ -0,0 +1,30 @@ +package sysgo + +import "sync" + +// PeerRegistrar lets connect helpers record a re-connect closure on a node so +// the same peering is re-established whenever the node (re)starts. Nodes that +// implement this MUST replay registered connectors once they finish coming up. +type PeerRegistrar interface { + RegisterPeerConnector(connect func()) +} + +// peerRegistry is the embeddable state behind RegisterPeerConnector. +type peerRegistry struct { + mu sync.Mutex + connectors []func() +} + +func (r *peerRegistry) RegisterPeerConnector(connect func()) { + r.mu.Lock() + r.connectors = append(r.connectors, connect) + r.mu.Unlock() +} + +func (r *peerRegistry) snapshotConnectors() []func() { + r.mu.Lock() + defer r.mu.Unlock() + out := make([]func(), len(r.connectors)) + copy(out, r.connectors) + return out +} diff --git a/op-devstack/sysgo/runtime_state.go b/op-devstack/sysgo/runtime_state.go index 60d7d8b49b4..0a73d2c46ff 100644 --- a/op-devstack/sysgo/runtime_state.go +++ b/op-devstack/sysgo/runtime_state.go @@ -122,13 +122,15 @@ func (r *SingleChainRuntime) VMConfig(t devtest.T, dir string) *vm.Config { } type MultiChainNodeRuntime struct { - Name string - Network *L2Network - EL L2ELNode - CL L2CLNode - Batcher *L2Batcher - Proposer *L2Proposer - Followers map[string]*SingleChainNodeRuntime + Name string + Network *L2Network + EL L2ELNode + CL L2CLNode + Batcher *L2Batcher + Proposer *L2Proposer + Followers map[string]*SingleChainNodeRuntime + VerifierEL L2ELNode + VerifierCL L2CLNode } type MultiChainRuntime struct { @@ -145,6 +147,7 @@ type MultiChainRuntime struct { PrimarySupervisor Supervisor Supernode *SuperNode + VerifierSupernode *SuperNode FaucetService *faucet.Service TimeTravel *clock.AdvancingClock diff --git a/op-devstack/sysgo/singlechain_build.go b/op-devstack/sysgo/singlechain_build.go index caa9c9742e4..7d287fdc255 100644 --- a/op-devstack/sysgo/singlechain_build.go +++ b/op-devstack/sysgo/singlechain_build.go @@ -236,33 +236,89 @@ func connectL2ELPeers(t devtest.T, logger log.Logger, initiatorRPC, acceptorRPC ConnectP2P(t.Ctx(), require, rpc1, rpc2, trusted) } +// connectL2ELPeersBidi calls admin_addPeer in both directions so small static +// EL-sync topologies don't depend on discovery. If both nodes implement +// PeerRegistrar, the bidi+trusted dial is also registered for replay on +// (re)start so the same topology survives wipes and restarts. +func connectL2ELPeersBidi(t devtest.T, logger log.Logger, a, b L2ELNode, trusted bool) { + dial := func() { + connectL2ELPeers(t, logger, a.UserRPC(), b.UserRPC(), trusted) + connectL2ELPeers(t, logger, b.UserRPC(), a.UserRPC(), trusted) + } + dial() + registerOnBoth(a, b, dial) +} + +// connectL2CLPeers bidi-connects two CL nodes and registers the dial for +// replay on (re)start of either side. func connectL2CLPeers(t devtest.T, logger log.Logger, l2CL1, l2CL2 L2CLNode) { + dial := func() { connectL2CLPeersOnce(t, logger, l2CL1, l2CL2) } + dial() + registerOnBoth(l2CL1, l2CL2, dial) +} + +func connectL2CLPeersOnce(t devtest.T, logger log.Logger, l2CL1, l2CL2 L2CLNode) { require := t.Require() ctx := t.Ctx() - p := getP2PClientsAndPeers(ctx, logger, require, l2CL1, l2CL2) - - connectPeer := func(p2pClient *sources.P2PClient, multiAddress string) { + // Refresh peer info on each retry and try every advertised address: a + // just-restarted in-process op-node can report stale listener addresses + // alongside the current one, so picking only Addresses[0] would target a + // dead port. + connectFromTo := func(from, to L2CLNode) { err := retry.Do0(ctx, 6, retry.Exponential(), func() error { - return p2pClient.ConnectPeer(ctx, multiAddress) + fromClient, err := GetP2PClient(ctx, logger, from) + if err != nil { + return err + } + toClient, err := GetP2PClient(ctx, logger, to) + if err != nil { + return err + } + toInfo, err := GetPeerInfo(ctx, toClient) + if err != nil { + return err + } + if len(toInfo.Addresses) == 0 { + return fmt.Errorf("no peer addresses for %s", toInfo.PeerID) + } + var lastErr error + for _, addr := range toInfo.Addresses { + if dialErr := fromClient.ConnectPeer(ctx, addr); dialErr == nil { + return nil + } else { + lastErr = dialErr + } + } + return lastErr }) require.NoError(err, "failed to connect L2CL peer") } + connectFromTo(l2CL1, l2CL2) + connectFromTo(l2CL2, l2CL1) - connectPeer(p.client1, p.peerInfo2.Addresses[0]) - connectPeer(p.client2, p.peerInfo1.Addresses[0]) - + p := getP2PClientsAndPeers(ctx, logger, require, l2CL1, l2CL2) peerDump1, err := GetPeers(ctx, p.client1) require.NoError(err) peerDump2, err := GetPeers(ctx, p.client2) require.NoError(err) - _, ok1 := peerDump1.Peers[p.peerInfo2.PeerID.String()] require.True(ok1, "peer register invalid (cl1 missing cl2)") _, ok2 := peerDump2.Peers[p.peerInfo1.PeerID.String()] require.True(ok2, "peer register invalid (cl2 missing cl1)") } +// registerOnBoth registers connect with whichever of a and b implement +// PeerRegistrar. Stand-alone nodes that don't need replay can ignore it. +func registerOnBoth[T any](a, b T, connect func()) { + if r, ok := any(a).(PeerRegistrar); ok { + r.RegisterPeerConnector(connect) + } + if r, ok := any(b).(PeerRegistrar); ok { + r.RegisterPeerConnector(connect) + } +} + func startSequencerCL( t devtest.T, keys devkeys.Keys,