diff --git a/Cargo.lock b/Cargo.lock index 7c9deec99..0afa4a4d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3636,6 +3636,7 @@ dependencies = [ name = "magicblock-chainlink" version = "0.2.3" dependencies = [ + "arc-swap", "assert_matches", "async-trait", "bincode", @@ -3647,6 +3648,7 @@ dependencies = [ "magicblock-core", "magicblock-delegation-program", "magicblock-magic-program-api", + "magicblock-metrics", "serde_json", "solana-account", "solana-account-decoder", @@ -3872,6 +3874,7 @@ dependencies = [ "solana-feature-set", "solana-fee", "solana-fee-structure", + "solana-keypair", "solana-loader-v4-program", "solana-program", "solana-program-runtime", @@ -3932,6 +3935,7 @@ version = "0.2.3" dependencies = [ "ed25519-dalek", "log", + "magicblock-metrics", "magicblock-rpc-client", "rand 0.8.5", "sha3", @@ -6233,7 +6237,7 @@ dependencies = [ [[package]] name = "solana-account" version = "2.2.1" -source = "git+https://github.com/magicblock-labs/solana-account.git?rev=f454d4a#f454d4a67a1ca64b87002025868f5369428e1c54" +source = "git+https://github.com/magicblock-labs/solana-account.git?rev=731fa50#731fa5037bf89929da76759f2281c1cb4833a8b7" dependencies = [ "bincode", "qualifier_attr", diff --git a/Cargo.toml b/Cargo.toml index b7c9686d8..8f7b05070 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -91,7 +91,7 @@ jsonrpc-pubsub = "18.0.0" jsonrpc-ws-server = "18.0.0" lazy_static = "1.4.0" libc = "0.2.153" -log = { version = "0.4.20", features = ["release_max_level_info"] } +log = { version = "0.4.20" } lru = "0.16.0" macrotest = "1" magic-domain-program = { git = "https://github.com/magicblock-labs/magic-domain-program.git", rev = "ea04d46", default-features = false } @@ -151,7 +151,7 @@ serde = "1.0.217" serde_derive = "1.0" serde_json = "1.0" sha3 = "0.10.8" -solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "f454d4a" } +solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "731fa50" } solana-account-decoder = { version = "2.2" } solana-accounts-db = { version = "2.2" } solana-account-decoder-client-types = { version = "2.2" } @@ -227,6 +227,6 @@ features = ["dev-context-only-utils"] # some solana dependencies have solana-storage-proto as dependency # we need to patch them with our version, because they use protobuf-src v1.1.0 # and we use protobuf-src v2.1.1. 
Otherwise compilation fails -solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "f454d4a" } +solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "731fa50" } solana-storage-proto = { path = "./storage-proto" } solana-svm = { git = "https://github.com/magicblock-labs/magicblock-svm.git", rev = "11bbaf2" } diff --git a/magicblock-account-cloner/src/lib.rs b/magicblock-account-cloner/src/lib.rs index 311eb787d..28021a237 100644 --- a/magicblock-account-cloner/src/lib.rs +++ b/magicblock-account-cloner/src/lib.rs @@ -173,12 +173,15 @@ impl ChainlinkCloner { // Create and initialize the program account in retracted state // and then deploy it and finally set the authority to match the // one on chain + let slot = self.accounts_db.slot(); let DeployableV4Program { pre_deploy_loader_state, deploy_instruction, post_deploy_loader_state, - } = program - .try_into_deploy_data_and_ixs_v4(validator_kp.pubkey())?; + } = program.try_into_deploy_data_and_ixs_v4( + slot, + validator_kp.pubkey(), + )?; let lamports = Rent::default() .minimum_balance(pre_deploy_loader_state.len()); diff --git a/magicblock-accounts-db/src/lib.rs b/magicblock-accounts-db/src/lib.rs index 8f714fe72..74b516c20 100644 --- a/magicblock-accounts-db/src/lib.rs +++ b/magicblock-accounts-db/src/lib.rs @@ -1,4 +1,4 @@ -use std::{path::Path, sync::Arc}; +use std::{collections::HashSet, path::Path, sync::Arc}; use error::AccountsDbError; use index::{ @@ -356,7 +356,7 @@ impl AccountsBank for AccountsDb { .iter_all() .filter(|(pk, acc)| predicate(pk, acc)) .map(|(pk, _)| pk) - .collect::>(); + .collect::>(); let removed = to_remove.len(); for pk in to_remove { self.remove_account(&pk); diff --git a/magicblock-aperture/src/requests/http/mod.rs b/magicblock-aperture/src/requests/http/mod.rs index 4c1897edd..ea599e999 100644 --- a/magicblock-aperture/src/requests/http/mod.rs +++ b/magicblock-aperture/src/requests/http/mod.rs @@ -112,7 +112,7 @@ impl HttpDispatcher { .inspect_err(|e| { // There is nothing we can do if fetching the account fails // Log the error and return whatever is in the accounts db - warn!("Failed to ensure account {pubkey}: {e}"); + debug!("Failed to ensure account {pubkey}: {e}"); }); self.accountsdb.get_account(pubkey) } diff --git a/magicblock-aperture/src/requests/http/send_transaction.rs b/magicblock-aperture/src/requests/http/send_transaction.rs index 67f1c446c..9bf1f7012 100644 --- a/magicblock-aperture/src/requests/http/send_transaction.rs +++ b/magicblock-aperture/src/requests/http/send_transaction.rs @@ -1,4 +1,4 @@ -use log::{debug, trace}; +use log::*; use magicblock_metrics::metrics::{ TRANSACTION_PROCESSING_TIME, TRANSACTION_SKIP_PREFLIGHT, }; diff --git a/magicblock-aperture/src/tests.rs b/magicblock-aperture/src/tests.rs index 8d49c818c..643fbb737 100644 --- a/magicblock-aperture/src/tests.rs +++ b/magicblock-aperture/src/tests.rs @@ -42,6 +42,7 @@ fn chainlink(accounts_db: &Arc) -> ChainlinkImpl { None, Pubkey::new_unique(), Pubkey::new_unique(), + 0, ) .expect("Failed to create Chainlink") } diff --git a/magicblock-aperture/tests/setup.rs b/magicblock-aperture/tests/setup.rs index decfacf9d..6160f75e0 100644 --- a/magicblock-aperture/tests/setup.rs +++ b/magicblock-aperture/tests/setup.rs @@ -62,6 +62,7 @@ fn chainlink(accounts_db: &Arc) -> Arc { None, Pubkey::new_unique(), Pubkey::new_unique(), + 0, ) .expect("Failed to create Chainlink"), ) diff --git a/magicblock-api/src/magic_validator.rs b/magicblock-api/src/magic_validator.rs 
index df797f988..f418dd405 100644 --- a/magicblock-api/src/magic_validator.rs +++ b/magicblock-api/src/magic_validator.rs @@ -441,6 +441,7 @@ impl MagicValidator { validator_pubkey, faucet_pubkey, chainlink_config, + config.accounts.clone.auto_airdrop_lamports, ) .await?; diff --git a/magicblock-api/src/tickers.rs b/magicblock-api/src/tickers.rs index 339ba5de5..d1df1e993 100644 --- a/magicblock-api/src/tickers.rs +++ b/magicblock-api/src/tickers.rs @@ -101,7 +101,6 @@ async fn handle_scheduled_commits( error!("Failed to process scheduled commits: {:?}", err); } } - #[allow(unused_variables)] pub fn init_system_metrics_ticker( tick_duration: Duration, diff --git a/magicblock-chainlink/Cargo.toml b/magicblock-chainlink/Cargo.toml index 71e7eb0f4..ea342b02f 100644 --- a/magicblock-chainlink/Cargo.toml +++ b/magicblock-chainlink/Cargo.toml @@ -4,6 +4,7 @@ version.workspace = true edition.workspace = true [dependencies] +arc-swap = "1.7" async-trait = { workspace = true } bincode = { workspace = true } env_logger = { workspace = true } @@ -12,7 +13,8 @@ log = { workspace = true } lru = { workspace = true } magicblock-core = { workspace = true } magicblock-magic-program-api = { workspace = true } -magicblock-delegation-program = { workspace = true } +magicblock-metrics = { workspace = true } + magicblock-delegation-program = { workspace = true } serde_json = { workspace = true } solana-account = { workspace = true } solana-account-decoder = { workspace = true } diff --git a/magicblock-chainlink/src/chainlink/blacklisted_accounts.rs b/magicblock-chainlink/src/chainlink/blacklisted_accounts.rs index f596c6ad4..51d06f73b 100644 --- a/magicblock-chainlink/src/chainlink/blacklisted_accounts.rs +++ b/magicblock-chainlink/src/chainlink/blacklisted_accounts.rs @@ -11,7 +11,6 @@ pub fn blacklisted_accounts( // want to take a dependency on that crate just for this ID which won't change const NATIVE_SOL_ID: Pubkey = solana_sdk::pubkey!("So11111111111111111111111111111111111111112"); - let mut blacklisted_accounts = sysvar_accounts() .into_iter() .chain(native_program_accounts()) @@ -48,6 +47,9 @@ pub fn sysvar_accounts() -> HashSet { } pub fn native_program_accounts() -> HashSet { + const NATIVE_TOKEN_PROGRAM_ID: Pubkey = + solana_sdk::pubkey!("TokenkegQfeZyiNwAJbNbGKPFXCWuBvf9Ss623VQ5DA"); + let mut blacklisted_programs = HashSet::new(); blacklisted_programs.insert(solana_sdk::address_lookup_table::program::ID); blacklisted_programs.insert(solana_sdk::bpf_loader::ID); @@ -63,5 +65,6 @@ pub fn native_program_accounts() -> HashSet { blacklisted_programs.insert(solana_sdk::stake::program::ID); blacklisted_programs.insert(solana_sdk::system_program::ID); blacklisted_programs.insert(solana_sdk::vote::program::ID); + blacklisted_programs.insert(NATIVE_TOKEN_PROGRAM_ID); blacklisted_programs } diff --git a/magicblock-chainlink/src/chainlink/errors.rs b/magicblock-chainlink/src/chainlink/errors.rs index 5e0d44771..09e9c4cce 100644 --- a/magicblock-chainlink/src/chainlink/errors.rs +++ b/magicblock-chainlink/src/chainlink/errors.rs @@ -18,7 +18,7 @@ pub enum ChainlinkError { #[error("Cloner error: {0}")] ClonerError(#[from] crate::cloner::errors::ClonerError), - #[error("Delegation could not be decoded: {0} ({1:?})")] + #[error("Delegation record could not be decoded: {0} ({1:?})")] InvalidDelegationRecord(Pubkey, ProgramError), #[error("Failed to resolve one or more accounts {0} when getting delegation records")] diff --git a/magicblock-chainlink/src/chainlink/fetch_cloner.rs 
b/magicblock-chainlink/src/chainlink/fetch_cloner.rs index 6216ea5fd..ea5cfa441 100644 --- a/magicblock-chainlink/src/chainlink/fetch_cloner.rs +++ b/magicblock-chainlink/src/chainlink/fetch_cloner.rs @@ -14,6 +14,7 @@ use log::*; use magicblock_core::traits::AccountsBank; use solana_account::{AccountSharedData, ReadableAccount}; use solana_pubkey::Pubkey; +use solana_sdk::system_program; use tokio::{ sync::{mpsc, oneshot}, task, @@ -179,7 +180,7 @@ where let resolved_account = self.resolve_account_to_clone_from_forwarded_sub_with_unsubscribe(update) .await; - if let Some(account) = resolved_account { + if let Some(mut account) = resolved_account { // Ensure that the subscription update isn't out of order, i.e. we don't already // hold a newer version of the account in our bank let out_of_order_slot = self @@ -215,6 +216,30 @@ where ); } } + // Check if this is an undelegation completion + // Conditions: + // 1. In bank: account is delegated + // 2. In bank: owner is dlp::id() indicating undelegation was triggered + // 3. In update: owner is not dlp::id() + // NOTE: this check will be simpler once we have the `undelegating` flag + if let Some(in_bank) = + self.accounts_bank.get_account(&pubkey) + { + if in_bank.delegated() + && in_bank.owner().eq(&dlp::id()) + && !account.owner().eq(&dlp::id()) + { + debug!( + "Undelegation completed for account: {pubkey}" + ); + magicblock_metrics::metrics::inc_undelegation_completed(); + } + } + + // When cloning from subscription update, reset undelegating flag + // since the subscription update reflects current chain state + account.set_undelegating(false); + if account.executable() { self.handle_executable_sub_update(pubkey, account) .await; @@ -341,14 +366,11 @@ where let account = if let Some(delegation_record) = delegation_record { - let delegation_record = match DelegationRecord::try_from_bytes_with_discriminator( + let delegation_record = + match Self::parse_delegation_record( delegation_record.data(), - ).map_err(|err| { - ChainlinkError::InvalidDelegationRecord( - delegation_record_pubkey, - err, - ) - }) { + delegation_record_pubkey, + ) { Ok(x) => Some(x), Err(err) => { error!("Failed to parse delegation record for {pubkey}: {err}. Not cloning account."); @@ -428,6 +450,83 @@ where } } + /// Parses a delegation record from account data bytes. + /// Returns the parsed DelegationRecord, or InvalidDelegationRecord error + /// if parsing fails. + fn parse_delegation_record( + data: &[u8], + delegation_record_pubkey: Pubkey, + ) -> ChainlinkResult { + DelegationRecord::try_from_bytes_with_discriminator(data) + .copied() + .map_err(|err| { + ChainlinkError::InvalidDelegationRecord( + delegation_record_pubkey, + err, + ) + }) + } + + /// Fetches and parses the delegation record for an account, returning the + /// parsed DelegationRecord if found and valid, None otherwise. 
+ async fn fetch_and_parse_delegation_record( + &self, + account_pubkey: Pubkey, + min_context_slot: u64, + ) -> Option { + let delegation_record_pubkey = + delegation_record_pda_from_delegated_account(&account_pubkey); + + match self + .remote_account_provider + .try_get_multi_until_slots_match( + &[delegation_record_pubkey], + Some(MatchSlotsConfig { + min_context_slot: Some(min_context_slot), + ..Default::default() + }), + ) + .await + { + Ok(mut delegation_records) => { + if let Some(delegation_record_remote) = delegation_records.pop() + { + match delegation_record_remote.fresh_account() { + Some(delegation_record_account) => { + Self::parse_delegation_record( + delegation_record_account.data(), + delegation_record_pubkey, + ) + .ok() + } + None => None, + } + } else { + None + } + } + Err(_) => None, + } + } + + /// Checks if an account marked as undelegating is still delegated to our + /// validator. If not, returns false to indicate the account should be + /// refetched from chain. If still delegated to us, returns true to indicate + /// the bank version should be used. + async fn is_still_delegated_to_us(&self, pubkey: Pubkey) -> bool { + let min_context_slot = self.remote_account_provider.chain_slot(); + match self + .fetch_and_parse_delegation_record(pubkey, min_context_slot) + .await + { + Some(delegation_record) => { + delegation_record.authority.eq(&self.validator_pubkey) + || delegation_record.authority.eq(&Pubkey::default()) + } + None => false, + } + } + /// Tries to fetch all accounts in `pubkeys` and clone them into the bank. /// If `mark_empty` is provided, accounts in that list that are /// not found on chain will be added with zero lamports to the bank. @@ -581,7 +680,7 @@ where // For accounts we couldn't find we cannot do anything. We will let code depending // on them to be in the bank fail on its own if !not_found.is_empty() { - debug!( + trace!( "Could not find accounts on chain: {:?}", not_found .iter() @@ -612,6 +711,46 @@ where ); } + // For accounts in the bank that are marked as undelegating, check if they're still + // delegated to us. If not, we need to refetch them from chain instead of using the + // bank version. + let mut accounts_to_refetch = vec![]; + for (pubkey, slot) in &in_bank { + if let Some(bank_account) = self.accounts_bank.get_account(pubkey) { + if bank_account.undelegating() { + // Check if still delegated to us + if !self.is_still_delegated_to_us(*pubkey).await { + debug!( + "Account {pubkey} marked as undelegating is no longer delegated to us, refetching from chain" + ); + accounts_to_refetch.push((*pubkey, *slot)); + } + } + } + } + + // Remove accounts that need to be refetched from in_bank list + let _in_bank: Vec<_> = in_bank + .into_iter() + .filter(|(pubkey, _)| { + !accounts_to_refetch.iter().any(|(p, _)| p == pubkey) + }) + .collect(); + + // Add accounts that need to be refetched to the plain list + // (they will be fetched from chain) + let mut plain = plain; + for (pubkey, _slot) in accounts_to_refetch { + if let Some(account) = self + .remote_account_provider + .try_get(pubkey) + .await? 
+ .fresh_account() + { + plain.push((pubkey, account)); + } + } + // Calculate min context slot: use the greater of subscription slot or last chain slot let min_context_slot = slot.map(|subscription_slot| { subscription_slot.max(self.remote_account_provider.chain_slot()) @@ -694,34 +833,35 @@ where // If the account is delegated we set the owner and delegation state if let Some(delegation_record_data) = delegation_record { - let delegation_record = match - DelegationRecord::try_from_bytes_with_discriminator( - delegation_record_data.data(), - ) - // NOTE: failing here is fine when resolving all accounts for a transaction - // since if something is off we better not run it anyways - // However we may consider a different behavior when user is getting - // mutliple accounts. - .map_err(|err| { - ChainlinkError::InvalidDelegationRecord( - delegation_record_pubkey, - err, + // NOTE: failing here is fine when resolving all accounts for a transaction + // since if something is off we better not run it anyways + // However we may consider a different behavior when user is getting + // mutliple accounts. + let delegation_record = match Self::parse_delegation_record( + delegation_record_data.data(), + delegation_record_pubkey, + ) { + Ok(x) => x, + Err(err) => { + // Cancel all new subs since we won't clone any accounts + cancel_subs( + &self.remote_account_provider, + CancelStrategy::New { + new_subs: pubkeys + .iter() + .cloned() + .chain(record_subs.iter().cloned()) + .collect(), + existing_subs: existing_subs + .into_iter() + .cloned() + .collect(), + }, ) - }) { - Ok(x) => x, - Err(err) => { - // Cancel all new subs since we won't clone any accounts - cancel_subs( - &self.remote_account_provider, - CancelStrategy::New { - new_subs: pubkeys.iter().cloned().chain(record_subs.iter().cloned()).collect(), - existing_subs: existing_subs.into_iter().cloned().collect(), - }, - ) - .await; - return Err(err); - } - }; + .await; + return Err(err); + } + }; trace!("Delegation record found for {pubkey}: {delegation_record:?}"); let is_delegated_to_us = delegation_record @@ -960,24 +1100,37 @@ where .lock() .expect("pending_requests lock poisoned"); - for &pubkey in pubkeys { - // Check synchronously if account is in bank - if self.accounts_bank.get_account(&pubkey).is_some() { - // Account is already in bank, we can skip it as it will be handled - // by the existing fetch_and_clone_accounts logic when needed - continue; + for pubkey in pubkeys { + // Check synchronously if account is in bank and subscribed when it should be + if let Some(account_in_bank) = + self.accounts_bank.get_account(pubkey) + { + // NOTE: we defensively correct accounts that we should have been watching but + // were not for some reason. We fetch them again in that case. + // This actually would point to a bug in the subscription logic. 
+ // TODO(thlorenz): remove this once we are certain (by perusing logs) that this + // does not happen anymore + if account_in_bank.owner().eq(&dlp::id()) + || account_in_bank.delegated() + || self.blacklisted_accounts.contains(pubkey) + || self.is_watching(pubkey) + { + continue; + } else if !self.is_watching(pubkey) { + debug!("Account {pubkey} should be watched but wasn't"); + } } // Check if account fetch is already pending - if let Some(requests) = pending.get_mut(&pubkey) { + if let Some(requests) = pending.get_mut(pubkey) { let (sender, receiver) = oneshot::channel(); requests.push(sender); - await_pending.push((pubkey, receiver)); + await_pending.push((*pubkey, receiver)); continue; } // Account needs to be fetched - add to fetch list - fetch_new.push(pubkey); + fetch_new.push(*pubkey); } // Create pending entries for accounts we need to fetch @@ -1024,9 +1177,14 @@ where // Wait for any pending requests to complete let mut joinset = JoinSet::new(); - for (_, receiver) in await_pending { + for (pubkey, receiver) in await_pending { joinset.spawn(async move { - if let Err(err) = receiver.await { + if let Err(err) = receiver + .await + .inspect_err(|err| { + warn!("FetchCloner::clone_accounts - RecvError occurred while awaiting account {}: {err:?}. This indicates the account fetch sender was dropped without sending a value.", pubkey); + }) + { // The sender was dropped, likely due to an error in the other request error!( "Failed to receive account from pending request: {err}" @@ -1192,6 +1350,32 @@ where ) -> ChainlinkResult> { Ok(self.remote_account_provider.try_get_removed_account_rx()?) } + + /// Best-effort airdrop helper: if the account doesn't exist in the bank or has 0 lamports, + /// create/overwrite it as a plain system account with the provided lamports using the cloner path. 
+ pub async fn airdrop_account_if_empty( + &self, + pubkey: Pubkey, + lamports: u64, + ) -> ClonerResult<()> { + if lamports == 0 { + return Ok(()); + } + if let Some(acc) = self.accounts_bank.get_account(&pubkey) { + if acc.lamports() > 0 { + return Ok(()); + } + } + // Build a plain system account with the requested balance + let account = + AccountSharedData::new(lamports, 0, &system_program::id()); + debug!( + "Auto-airdropping {} lamports to new/empty account {}", + lamports, pubkey + ); + let _sig = self.cloner.clone_account(pubkey, account).await?; + Ok(()) + } } // ----------------- @@ -1499,9 +1683,12 @@ mod tests { rpc_client, pubsub_client, forward_tx, - &RemoteAccountProviderConfig::default_with_lifecycle_mode( + &RemoteAccountProviderConfig::try_new_with_metrics( + 1000, LifecycleMode::Ephemeral, - ), + false, + ) + .unwrap(), ) .await .unwrap(), diff --git a/magicblock-chainlink/src/chainlink/mod.rs b/magicblock-chainlink/src/chainlink/mod.rs index 7d0fd7795..5b8606e3a 100644 --- a/magicblock-chainlink/src/chainlink/mod.rs +++ b/magicblock-chainlink/src/chainlink/mod.rs @@ -1,4 +1,7 @@ -use std::sync::Arc; +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; use dlp::pda::ephemeral_balance_pda_from_payer; use errors::ChainlinkResult; @@ -50,6 +53,9 @@ pub struct Chainlink< validator_id: Pubkey, faucet_id: Pubkey, + + /// If > 0, automatically airdrop this many lamports to feepayers when they are new/empty + auto_airdrop_lamports: u64, } impl @@ -60,6 +66,7 @@ impl fetch_cloner: Option>>, validator_pubkey: Pubkey, faucet_pubkey: Pubkey, + auto_airdrop_lamports: u64, ) -> ChainlinkResult { let removed_accounts_sub = if let Some(fetch_cloner) = &fetch_cloner { let removed_accounts_rx = @@ -77,9 +84,11 @@ impl removed_accounts_sub, validator_id: validator_pubkey, faucet_id: faucet_pubkey, + auto_airdrop_lamports, }) } + #[allow(clippy::too_many_arguments)] pub async fn try_new_from_endpoints( endpoints: &[Endpoint], commitment: CommitmentConfig, @@ -88,6 +97,7 @@ impl validator_pubkey: Pubkey, faucet_pubkey: Pubkey, config: ChainlinkConfig, + auto_airdrop_lamports: u64, ) -> ChainlinkResult< Chainlink< ChainRpcClientImpl, @@ -126,6 +136,7 @@ impl fetch_cloner, validator_pubkey, faucet_pubkey, + auto_airdrop_lamports, ) } @@ -136,15 +147,56 @@ impl pub fn reset_accounts_bank(&self) { let blacklisted_accounts = blacklisted_accounts(&self.validator_id, &self.faucet_id); + + let delegated = AtomicU64::new(0); + let dlp_owned_not_delegated = AtomicU64::new(0); + let blacklisted = AtomicU64::new(0); + let remaining = AtomicU64::new(0); + let remaining_empty = AtomicU64::new(0); + let removed = self.accounts_bank.remove_where(|pubkey, account| { - (!account.delegated() - // This fixes the edge-case of accounts that were in the process of - // being undelegated but never completed while the validator was running - || account.owner().eq(&dlp::id())) - && !blacklisted_accounts.contains(pubkey) + if blacklisted_accounts.contains(pubkey) { + blacklisted.fetch_add(1, Ordering::Relaxed); + return false; + } + // TODO: this potentially looses data and is a temporary measure + if account.owner().eq(&dlp::id()) { + dlp_owned_not_delegated.fetch_add(1, Ordering::Relaxed); + return true; + } + if account.delegated() { + delegated.fetch_add(1, Ordering::Relaxed); + return false; + } + trace!( + "Removing non-delegated, non-DLP-owned account: {pubkey} {:#?}", + account + ); + remaining.fetch_add(1, Ordering::Relaxed); + if account.lamports() == 0 + && 
account.owner().ne(&solana_sdk::feature::id()) + { + remaining_empty.fetch_add(1, Ordering::Relaxed); + } + true }); - debug!("Removed {removed} non-delegated accounts"); + let non_empty = remaining + .load(Ordering::Relaxed) + .saturating_sub(remaining_empty.load(Ordering::Relaxed)); + + info!( + "Removed {removed} accounts from bank: +{} DLP-owned non-delegated +{} non-delegated non-blacklisted, no-feature non-empty. +{} non-delegated non-blacklisted empty +Kept: {} delegated, {} blacklisted", + dlp_owned_not_delegated.into_inner(), + non_empty, + remaining_empty.into_inner(), + delegated.into_inner(), + blacklisted.into_inner() + ); } fn subscribe_account_removals( @@ -204,18 +256,48 @@ impl .is_none_or(|a| !a.delegated()) }; - let mark_empty_if_not_found = if clone_escrow { + // Always allow the fee payer to be treated as empty-if-not-found so that + // transactions can still be processed in gasless mode + let mut mark_empty_if_not_found = vec![*feepayer]; + + if clone_escrow { let balance_pda = ephemeral_balance_pda_from_payer(feepayer, 0); trace!("Adding balance PDA {balance_pda} for feepayer {feepayer}"); pubkeys.push(balance_pda); - vec![balance_pda] - } else { - vec![] - }; + mark_empty_if_not_found.push(balance_pda); + } let mark_empty_if_not_found = (!mark_empty_if_not_found.is_empty()) .then(|| &mark_empty_if_not_found[..]); - self.ensure_accounts(&pubkeys, mark_empty_if_not_found) - .await + let res = self + .ensure_accounts(&pubkeys, mark_empty_if_not_found) + .await?; + + // Best-effort auto airdrop for fee payer if configured and still empty locally + if self.auto_airdrop_lamports > 0 { + if let Some(fetch_cloner) = self.fetch_cloner() { + let lamports = self + .accounts_bank + .get_account(feepayer) + .map(|a| a.lamports()) + .unwrap_or(0); + if lamports == 0 { + if let Err(err) = fetch_cloner + .airdrop_account_if_empty( + *feepayer, + self.auto_airdrop_lamports, + ) + .await + { + warn!( + "Auto airdrop for feepayer {} failed: {:?}", + feepayer, err + ); + } + } + } + } + + Ok(res) } /// Same as fetch accounts, but does not return the accounts, just @@ -283,7 +365,15 @@ impl .map(|p| p.to_string()) .collect::>() .join(", "); - trace!("Fetching accounts: {pubkeys_str}"); + let mark_empty_str = mark_empty_if_not_found + .map(|keys| { + keys.iter() + .map(|p| p.to_string()) + .collect::>() + .join(", ") + }) + .unwrap_or_default(); + trace!("Fetching accounts: {pubkeys_str}, mark_empty_if_not_found: {mark_empty_str}"); } Self::promote_accounts( fetch_cloner, @@ -311,7 +401,9 @@ impl &self, pubkey: Pubkey, ) -> ChainlinkResult<()> { - trace!("Undelegation requested for account: {pubkey}"); + debug!("Undelegation requested for account: {pubkey}"); + + magicblock_metrics::metrics::inc_undelegation_requested(); let Some(fetch_cloner) = self.fetch_cloner() else { return Ok(()); @@ -321,7 +413,7 @@ impl // once it's undelegated fetch_cloner.subscribe_to_account(&pubkey).await?; - trace!("Successfully subscribed to account {pubkey} for undelegation tracking"); + debug!("Successfully subscribed to account {pubkey} for undelegation tracking"); Ok(()) } diff --git a/magicblock-chainlink/src/remote_account_provider/chain_pubsub_actor.rs b/magicblock-chainlink/src/remote_account_provider/chain_pubsub_actor.rs index 030bf93bb..a8259f71e 100644 --- a/magicblock-chainlink/src/remote_account_provider/chain_pubsub_actor.rs +++ b/magicblock-chainlink/src/remote_account_provider/chain_pubsub_actor.rs @@ -1,22 +1,30 @@ use std::{ - collections::{HashMap, HashSet}, + collections::HashMap, fmt, 
- sync::{Arc, Mutex}, + sync::{ + atomic::{AtomicBool, AtomicU16, Ordering}, + Arc, Mutex, + }, }; use log::*; use solana_account_decoder_client_types::{UiAccount, UiAccountEncoding}; use solana_pubkey::Pubkey; -use solana_pubsub_client::nonblocking::pubsub_client::PubsubClient; use solana_rpc_client_api::{ config::RpcAccountInfoConfig, response::Response as RpcResponse, }; use solana_sdk::{commitment_config::CommitmentConfig, sysvar::clock}; -use tokio::sync::{mpsc, oneshot}; +use tokio::{ + sync::{mpsc, oneshot}, + time::Duration, +}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use super::errors::{RemoteAccountProviderError, RemoteAccountProviderResult}; +use super::{ + chain_pubsub_client::PubSubConnection, + errors::{RemoteAccountProviderError, RemoteAccountProviderResult}, +}; // Log every 10 secs (given chain slot time is 400ms) const CLOCK_LOG_SLOT_FREQ: u64 = 25; @@ -65,21 +73,25 @@ struct AccountSubscription { pub struct ChainPubsubActor { /// Configuration used to create the pubsub client pubsub_client_config: PubsubClientConfig, - /// Underlying pubsub client to connect to the chain - pubsub_client: Arc<PubsubClient>, + /// Underlying pubsub connection to connect to the chain + pubsub_connection: Arc<PubSubConnection>, /// Sends subscribe/unsubscribe messages to this actor messages_sender: mpsc::Sender<ChainPubsubActorMessage>, /// Map of subscriptions we are holding subscriptions: Arc<Mutex<HashMap<Pubkey, AccountSubscription>>>, /// Sends updates for any account subscription that is received via - /// the [Self::pubsub_client] + /// the [Self::pubsub_connection] subscription_updates_sender: mpsc::Sender<SubscriptionUpdate>, - /// The tasks that watch subscriptions via the [Self::pubsub_client] and - /// channel them into the [Self::subscription_updates_sender] - subscription_watchers: Arc<Mutex<tokio::task::JoinSet<()>>>, /// The token to use to cancel all subscriptions and shut down the /// message listener, essentially shutting down this actor shutdown_token: CancellationToken, + /// Unique client ID for this actor instance used in logs + client_id: u16, + /// Indicates whether the actor is connected or has been disconnected due to RPC connection + /// issues + is_connected: Arc<AtomicBool>, + /// Channel used to signal connection issues to the submux + abort_sender: mpsc::Sender<()>, } #[derive(Debug)] @@ -92,7 +104,7 @@ pub enum ChainPubsubActorMessage { pubkey: Pubkey, response: oneshot::Sender<RemoteAccountProviderResult<()>>, }, - RecycleConnections { + Reconnect { response: oneshot::Sender<RemoteAccountProviderResult<()>>, }, } @@ -103,36 +115,40 @@ const MESSAGE_CHANNEL_SIZE: usize = 1_000; impl ChainPubsubActor { pub async fn new_from_url( pubsub_url: &str, + abort_sender: mpsc::Sender<()>, commitment: CommitmentConfig, ) -> RemoteAccountProviderResult<(Self, mpsc::Receiver<SubscriptionUpdate>)> { let config = PubsubClientConfig::from_url(pubsub_url, commitment); - Self::new(config).await + Self::new(abort_sender, config).await } pub async fn new( + abort_sender: mpsc::Sender<()>, pubsub_client_config: PubsubClientConfig, ) -> RemoteAccountProviderResult<(Self, mpsc::Receiver<SubscriptionUpdate>)> { - let pubsub_client = Arc::new( - PubsubClient::new(pubsub_client_config.pubsub_url.as_str()).await?, - ); + static CLIENT_ID: AtomicU16 = AtomicU16::new(0); + + let url = pubsub_client_config.pubsub_url.clone(); + let pubsub_connection = Arc::new(PubSubConnection::new(url).await?); let (subscription_updates_sender, subscription_updates_receiver) = mpsc::channel(SUBSCRIPTION_UPDATE_CHANNEL_SIZE); let (messages_sender, messages_receiver) = mpsc::channel(MESSAGE_CHANNEL_SIZE); - let subscription_watchers = - Arc::new(Mutex::new(tokio::task::JoinSet::new())); + let shutdown_token = CancellationToken::new();
let me = Self { pubsub_client_config, - pubsub_client, + pubsub_connection, messages_sender, subscriptions: Default::default(), subscription_updates_sender, - subscription_watchers, shutdown_token, + client_id: CLIENT_ID.fetch_add(1, Ordering::SeqCst), + is_connected: Arc::new(AtomicBool::new(true)), + abort_sender, }; me.start_worker(messages_receiver); @@ -142,7 +158,10 @@ impl ChainPubsubActor { } pub async fn shutdown(&self) { - info!("Shutting down ChainPubsubActor"); + info!( + "[client_id={}] Shutting down ChainPubsubActor", + self.client_id + ); let subs = self .subscriptions .lock() @@ -153,9 +172,34 @@ impl ChainPubsubActor { sub.cancellation_token.cancel(); } self.shutdown_token.cancel(); - // TODO: - // let mut subs = self.subscription_watchers.lock().unwrap();; - // subs.join_all().await; + } + + pub fn subscription_count(&self, filter: &[Pubkey]) -> usize { + if !self.is_connected.load(Ordering::SeqCst) { + return 0; + } + let subs = self + .subscriptions + .lock() + .expect("subscriptions lock poisoned"); + if filter.is_empty() { + subs.len() + } else { + subs.keys() + .filter(|pubkey| !filter.contains(pubkey)) + .count() + } + } + + pub fn subscriptions(&self) -> Vec { + if !self.is_connected.load(Ordering::SeqCst) { + return vec![]; + } + let subs = self + .subscriptions + .lock() + .expect("subscriptions lock poisoned"); + subs.keys().copied().collect() } pub async fn send_msg( @@ -175,23 +219,27 @@ impl ChainPubsubActor { mut messages_receiver: mpsc::Receiver, ) { let subs = self.subscriptions.clone(); - let subscription_watchers = self.subscription_watchers.clone(); let shutdown_token = self.shutdown_token.clone(); let pubsub_client_config = self.pubsub_client_config.clone(); let subscription_updates_sender = self.subscription_updates_sender.clone(); - let mut pubsub_client = self.pubsub_client.clone(); + let pubsub_connection = self.pubsub_connection.clone(); + let client_id = self.client_id; + let is_connected = self.is_connected.clone(); + let abort_sender = self.abort_sender.clone(); tokio::spawn(async move { loop { tokio::select! 
{ msg = messages_receiver.recv() => { if let Some(msg) = msg { - pubsub_client = Self::handle_msg( + Self::handle_msg( subs.clone(), - pubsub_client.clone(), - subscription_watchers.clone(), + pubsub_connection.clone(), subscription_updates_sender.clone(), pubsub_client_config.clone(), + abort_sender.clone(), + client_id, + is_connected.clone(), msg ).await; } else { @@ -206,105 +254,152 @@ }); } + #[allow(clippy::too_many_arguments)] async fn handle_msg( subscriptions: Arc<Mutex<HashMap<Pubkey, AccountSubscription>>>, - pubsub_client: Arc<PubsubClient>, - subscription_watchers: Arc<Mutex<tokio::task::JoinSet<()>>>, + pubsub_connection: Arc<PubSubConnection>, subscription_updates_sender: mpsc::Sender<SubscriptionUpdate>, pubsub_client_config: PubsubClientConfig, + abort_sender: mpsc::Sender<()>, + client_id: u16, + is_connected: Arc<AtomicBool>, msg: ChainPubsubActorMessage, - ) -> Arc<PubsubClient> { + ) { + fn send_ok( + response: oneshot::Sender<RemoteAccountProviderResult<()>>, + client_id: u16, + ) { + let _ = response.send(Ok(())).inspect_err(|err| { + warn!( + "[client_id={client_id}] Failed to send msg ack: {err:?}" + ); + }); + } + match msg { ChainPubsubActorMessage::AccountSubscribe { pubkey, response } => { + if !is_connected.load(Ordering::SeqCst) { + trace!("[client_id={client_id}] Ignoring subscribe request for {pubkey} because disconnected"); + send_ok(response, client_id); + return; + } let commitment_config = pubsub_client_config.commitment_config; Self::add_sub( pubkey, response, subscriptions, - pubsub_client.clone(), - subscription_watchers, + pubsub_connection, subscription_updates_sender, + abort_sender, + is_connected, commitment_config, + client_id, ); - pubsub_client } ChainPubsubActorMessage::AccountUnsubscribe { pubkey, response, } => { + if !is_connected.load(Ordering::SeqCst) { + trace!("[client_id={client_id}] Ignoring unsubscribe request for {pubkey} because disconnected"); + send_ok(response, client_id); + return; + } if let Some(AccountSubscription { cancellation_token }) = - subscriptions.lock().unwrap().remove(&pubkey) + subscriptions + .lock() + .expect("subscriptions lock poisoned") + .get(&pubkey) { cancellation_token.cancel(); let _ = response.send(Ok(())); } else { - let _ = response + let _ = response .send(Err(RemoteAccountProviderError::AccountSubscriptionDoesNotExist( pubkey.to_string(), ))); } - pubsub_client } - ChainPubsubActorMessage::RecycleConnections { response } => { - match Self::recycle_connections( - subscriptions, - subscription_watchers, - subscription_updates_sender, + ChainPubsubActorMessage::Reconnect { response } => { + let result = Self::try_reconnect( + pubsub_connection, pubsub_client_config, + client_id, + is_connected, ) - .await - { - Ok(new_client) => { - let _ = response.send(Ok(())); - new_client - } - Err(err) => { - let _ = response.send(Err(err)); - pubsub_client - } - } + .await; + let _ = response.send(result); } } } + #[allow(clippy::too_many_arguments)] fn add_sub( pubkey: Pubkey, sub_response: oneshot::Sender<RemoteAccountProviderResult<()>>, subs: Arc<Mutex<HashMap<Pubkey, AccountSubscription>>>, - pubsub_client: Arc<PubsubClient>, - subscription_watchers: Arc<Mutex<tokio::task::JoinSet<()>>>, + pubsub_connection: Arc<PubSubConnection>, subscription_updates_sender: mpsc::Sender<SubscriptionUpdate>, + abort_sender: mpsc::Sender<()>, + is_connected: Arc<AtomicBool>, commitment_config: CommitmentConfig, + client_id: u16, ) { - trace!("Adding subscription for {pubkey} with commitment {commitment_config:?}"); + if subs + .lock() + .expect("subscriptions lock poisoned") + .contains_key(&pubkey) + { + trace!("[client_id={client_id}] Subscription for {pubkey} already exists, ignoring add_sub request"); + let _ = sub_response.send(Ok(())); + return; + } - let config = RpcAccountInfoConfig { - commitment: Some(commitment_config), - encoding:
Some(UiAccountEncoding::Base64Zstd), - ..Default::default() - }; + trace!("[client_id={client_id}] Adding subscription for {pubkey} with commitment {commitment_config:?}"); let cancellation_token = CancellationToken::new(); - let mut sub_joinset = subscription_watchers.lock().unwrap(); - sub_joinset.spawn(async move { - // Attempt to subscribe to the account - let (mut update_stream, unsubscribe) = match pubsub_client - .account_subscribe(&pubkey, Some(config)) - .await { + // Insert into subscriptions HashMap immediately to prevent race condition + // with unsubscribe operations + // Assuming that messages to this actor are processed in the order they are sent + // then this eliminates the possibility of an unsubscribe being processed before + // the sub's cancellation token was added to the map + { + let mut subs_lock = + subs.lock().expect("subscriptions lock poisoned"); + subs_lock.insert( + pubkey, + AccountSubscription { + cancellation_token: cancellation_token.clone(), + }, + ); + } + + tokio::spawn(async move { + let config = RpcAccountInfoConfig { + commitment: Some(commitment_config), + encoding: Some(UiAccountEncoding::Base64Zstd), + ..Default::default() + }; + let (mut update_stream, unsubscribe) = match pubsub_connection + .account_subscribe(&pubkey, config.clone()) + .await + { Ok(res) => res, Err(err) => { - let _ = sub_response.send(Err(err.into())); + error!("[client_id={client_id}] Failed to subscribe to account {pubkey} {err:?}"); + Self::abort_and_signal_connection_issue( + client_id, + subs.clone(), + abort_sender, + is_connected.clone(), + ); + return; } }; - // Then track the subscription and confirm to the requester that the - // subscription was made - subs.lock().unwrap().insert(pubkey, AccountSubscription { - cancellation_token: cancellation_token.clone(), - }); - + // RPC succeeded - confirm to the requester that the subscription was made let _ = sub_response.send(Ok(())); // Now keep listening for updates and relay them to the @@ -312,106 +407,129 @@ impl ChainPubsubActor { loop { tokio::select! { _ = cancellation_token.cancelled() => { - debug!("Subscription for {pubkey} was cancelled"); - unsubscribe().await; + trace!("[client_id={client_id}] Subscription for {pubkey} was cancelled"); break; } update = update_stream.next() => { if let Some(rpc_response) = update { if log_enabled!(log::Level::Trace) && (!pubkey.eq(&clock::ID) || rpc_response.context.slot % CLOCK_LOG_SLOT_FREQ == 0) { - trace!("Received update for {pubkey}: {rpc_response:?}"); + trace!("[client_id={client_id}] Received update for {pubkey}: {rpc_response:?}"); } let _ = subscription_updates_sender.send(SubscriptionUpdate { pubkey, rpc_response, }).await.inspect_err(|err| { - error!("Failed to send {pubkey} subscription update: {err:?}"); + error!("[client_id={client_id}] Failed to send {pubkey} subscription update: {err:?}"); }); } else { - debug!("Subscription for {pubkey} ended by update stream"); - break; + debug!("[client_id={client_id}] Subscription for {pubkey} ended (EOF); signaling connection issue"); + Self::abort_and_signal_connection_issue( + client_id, + subs.clone(), + abort_sender.clone(), + is_connected.clone(), + ); + // Return early - abort_and_signal_connection_issue cancels all + // subscriptions, triggering cleanup via the cancellation path + // above. No need to run unsubscribe/cleanup here. 
+ return; + } + } + } + } + + // Clean up subscription with timeout to prevent hanging on dead sockets + if tokio::time::timeout(Duration::from_secs(2), unsubscribe()) + .await + .is_err() + { + warn!( + "[client_id={client_id}] unsubscribe timed out for {pubkey}" + ); + } + subs.lock() + .expect("subscriptions lock poisoned") + .remove(&pubkey); }); } - async fn recycle_connections( - subscriptions: Arc<Mutex<HashMap<Pubkey, AccountSubscription>>>, - subscription_watchers: Arc<Mutex<tokio::task::JoinSet<()>>>, - subscription_updates_sender: mpsc::Sender<SubscriptionUpdate>, + async fn try_reconnect( + pubsub_connection: Arc<PubSubConnection>, pubsub_client_config: PubsubClientConfig, - ) -> RemoteAccountProviderResult<Arc<PubsubClient>> { - debug!("RecycleConnections: starting recycle process"); - - // 1. Recreate the pubsub client, in case that fails leave the old one in place - // as this is the best we can do - debug!( - "RecycleConnections: creating new PubsubClient for {}", - pubsub_client_config.pubsub_url - ); - let new_client = match PubsubClient::new( - pubsub_client_config.pubsub_url.as_str(), - ) - .await - { - Ok(c) => Arc::new(c), - Err(err) => { - error!("RecycleConnections: failed to create new PubsubClient: {err:?}"); - return Err(err.into()); - } + client_id: u16, + is_connected: Arc<AtomicBool>, + ) -> RemoteAccountProviderResult<()> { + // 1. Try to reconnect the pubsub connection + if let Err(err) = pubsub_connection.reconnect().await { + debug!("[client_id={}] failed to reconnect: {err:?}", client_id); + return Err(err.into()); + } + // Make a sub to any account and unsub immediately to verify connection + let pubkey = Pubkey::new_unique(); + let config = RpcAccountInfoConfig { + commitment: Some(pubsub_client_config.commitment_config), + encoding: Some(UiAccountEncoding::Base64Zstd), + ..Default::default() + }; - // Cancel all current subscriptions and collect pubkeys to re-subscribe later + // 2. Try to subscribe to an account to verify connection + let (_, unsubscribe) = + match pubsub_connection.account_subscribe(&pubkey, config).await { + Ok(res) => res, + Err(err) => { + error!( + "[client_id={}] failed to verify connection via subscribe {err:?}", + client_id + ); + return Err(err.into()); + } + }; + + // 3. Unsubscribe immediately + unsubscribe().await; + + // 4.
We are now connected again + is_connected.store(true, Ordering::SeqCst); + Ok(()) + } + + fn abort_and_signal_connection_issue( + client_id: u16, + subscriptions: Arc>>, + abort_sender: mpsc::Sender<()>, + is_connected: Arc, + ) { + // Only abort if we were connected; prevents duplicate aborts + if !is_connected.swap(false, Ordering::SeqCst) { + trace!( + "[client_id={client_id}] already disconnected, skipping abort" + ); + return; + } + + debug!("[client_id={client_id}] aborting"); + let drained = { let mut subs_lock = subscriptions.lock().unwrap(); std::mem::take(&mut *subs_lock) }; - let mut to_resubscribe = HashSet::new(); - for (pk, AccountSubscription { cancellation_token }) in drained { - to_resubscribe.insert(pk); + let drained_len = drained.len(); + for (_, AccountSubscription { cancellation_token }) in drained { cancellation_token.cancel(); } debug!( - "RecycleConnections: cancelled {} subscriptions", - to_resubscribe.len() + "[client_id={client_id}] canceled {} subscriptions", + drained_len ); - - // Abort and await all watcher tasks and add fresh joinset - debug!("RecycleConnections: aborting watcher tasks"); - let mut old_joinset = { - let mut watchers = subscription_watchers - .lock() - .expect("subscription_watchers lock poisonde"); - std::mem::replace(&mut *watchers, tokio::task::JoinSet::new()) - }; - old_joinset.abort_all(); - while let Some(_res) = old_joinset.join_next().await {} - debug!("RecycleConnections: watcher tasks terminated"); - - // Re-subscribe to all accounts - debug!( - "RecycleConnections: re-subscribing to {} accounts", - to_resubscribe.len() - ); - let commitment_config = pubsub_client_config.commitment_config; - for pk in to_resubscribe { - let (tx, _rx) = oneshot::channel(); - Self::add_sub( - pk, - tx, - subscriptions.clone(), - new_client.clone(), - subscription_watchers.clone(), - subscription_updates_sender.clone(), - commitment_config, - ); - } - - debug!("RecycleConnections: completed"); - - Ok(new_client) + // Use try_send to avoid blocking and naturally coalesce signals + let _ = abort_sender.try_send(()).inspect_err(|err| { + // Channel full is expected when reconnect is already in progress + if !matches!(err, mpsc::error::TrySendError::Full(_)) { + error!( + "[client_id={client_id}] failed to signal connection issue: {err:?}", + ) + } + }); } } diff --git a/magicblock-chainlink/src/remote_account_provider/chain_pubsub_client.rs b/magicblock-chainlink/src/remote_account_provider/chain_pubsub_client.rs index 7624ef752..719d12345 100644 --- a/magicblock-chainlink/src/remote_account_provider/chain_pubsub_client.rs +++ b/magicblock-chainlink/src/remote_account_provider/chain_pubsub_client.rs @@ -1,10 +1,24 @@ -use std::sync::{Arc, Mutex}; +use std::{ + mem, + sync::{Arc, Mutex}, + time::Duration, +}; +use arc_swap::ArcSwap; use async_trait::async_trait; +use futures_util::{future::BoxFuture, stream::BoxStream}; use log::*; +use solana_account_decoder::UiAccount; use solana_pubkey::Pubkey; +use solana_pubsub_client::nonblocking::pubsub_client::{ + PubsubClient, PubsubClientResult, +}; +use solana_rpc_client_api::{config::RpcAccountInfoConfig, response::Response}; use solana_sdk::commitment_config::CommitmentConfig; -use tokio::sync::{mpsc, oneshot}; +use tokio::{ + sync::{mpsc, oneshot, Mutex as AsyncMutex}, + time, +}; use super::{ chain_pubsub_actor::{ @@ -13,6 +27,90 @@ use super::{ errors::RemoteAccountProviderResult, }; +type UnsubscribeFn = Box BoxFuture<'static, ()> + Send>; +type SubscribeResult = PubsubClientResult<( + BoxStream<'static, 
Response<UiAccount>>, + UnsubscribeFn, +)>; + +const MAX_RECONNECT_ATTEMPTS: usize = 5; +const RECONNECT_ATTEMPT_DELAY: Duration = Duration::from_millis(500); + +pub struct PubSubConnection { + client: ArcSwap<PubsubClient>, + url: String, + reconnect_guard: AsyncMutex<()>, +} + +impl PubSubConnection { + pub async fn new(url: String) -> RemoteAccountProviderResult<Self> { + let client = Arc::new(PubsubClient::new(&url).await?).into(); + let reconnect_guard = AsyncMutex::new(()); + Ok(Self { + client, + url, + reconnect_guard, + }) + } + + pub fn url(&self) -> &str { + &self.url + } + + pub async fn account_subscribe( + &self, + pubkey: &Pubkey, + config: RpcAccountInfoConfig, + ) -> SubscribeResult { + let client = self.client.load(); + let config = Some(config.clone()); + let (stream, unsub) = client.account_subscribe(pubkey, config).await?; + // SAFETY: + // the returned stream depends on the used client, which is only ever dropped + // if the connection has been terminated, at which point the stream is useless + // and will be discarded as well, thus it is safe to extend its lifetime to 'static + let stream = unsafe { + mem::transmute::< + BoxStream<'_, Response<UiAccount>>, + BoxStream<'static, Response<UiAccount>>, + >(stream) + }; + Ok((stream, unsub)) + } + + pub async fn reconnect(&self) -> PubsubClientResult<()> { + // Prevents multiple reconnect attempts running concurrently + let _guard = match self.reconnect_guard.try_lock() { + Ok(g) => g, + // Reconnect is already in progress + Err(_) => { + // Wait a bit and return to retry subscription + time::sleep(RECONNECT_ATTEMPT_DELAY).await; + return Ok(()); + } + }; + let mut attempt = 1; + let client = loop { + match PubsubClient::new(&self.url).await { + Ok(c) => break Arc::new(c), + Err(error) => { + warn!( + "failed to reconnect to ws endpoint at {} {error}", + self.url + ); + if attempt == MAX_RECONNECT_ATTEMPTS { + return Err(error); + } + attempt += 1; + time::sleep(RECONNECT_ATTEMPT_DELAY).await; + } + } + }; + self.client.store(client); + Ok(()) + } +} + // ----------------- // Trait // ----------------- @@ -27,9 +125,31 @@ pub trait ChainPubsubClient: Send + Sync + Clone + 'static { pubkey: Pubkey, ) -> RemoteAccountProviderResult<()>; async fn shutdown(&self); - async fn recycle_connections(&self); fn take_updates(&self) -> mpsc::Receiver<SubscriptionUpdate>; + + /// Provides the total number of subscriptions and the number of + /// subscriptions when excluding pubkeys in `exclude`. + /// - `exclude`: Optional slice of pubkeys to exclude from the count. + /// Returns a tuple of (total subscriptions, filtered subscriptions). + async fn subscription_count( + &self, + exclude: Option<&[Pubkey]>, + ) -> (usize, usize); + + fn subscriptions(&self) -> Vec<Pubkey>; +} + +#[async_trait] +pub trait ReconnectableClient { + /// Attempts to reconnect to the pubsub server and should be invoked when the client sent the + /// abort signal. + async fn try_reconnect(&self) -> RemoteAccountProviderResult<()>; + /// Re-subscribes to multiple accounts after a reconnection.
+ async fn resub_multiple( + &self, + pubkeys: &[Pubkey], + ) -> RemoteAccountProviderResult<()>; } // ----------------- @@ -44,10 +164,15 @@ pub struct ChainPubsubClientImpl { impl ChainPubsubClientImpl { pub async fn try_new_from_url( pubsub_url: &str, + abort_sender: mpsc::Sender<()>, commitment: CommitmentConfig, ) -> RemoteAccountProviderResult { - let (actor, updates) = - ChainPubsubActor::new_from_url(pubsub_url, commitment).await?; + let (actor, updates) = ChainPubsubActor::new_from_url( + pubsub_url, + abort_sender, + commitment, + ) + .await?; Ok(Self { actor: Arc::new(actor), updates_rcvr: Arc::new(Mutex::new(Some(updates))), @@ -61,38 +186,6 @@ impl ChainPubsubClient for ChainPubsubClientImpl { self.actor.shutdown().await; } - async fn recycle_connections(&self) { - // Fire a recycle request to the actor and await the acknowledgement. - // If recycle fails there is nothing the caller could do, so we log an error instead - let (tx, rx) = oneshot::channel(); - if let Err(err) = self - .actor - .send_msg(ChainPubsubActorMessage::RecycleConnections { - response: tx, - }) - .await - { - error!( - "ChainPubsubClientImpl::recycle_connections: failed to send RecycleConnections: {err:?}" - ); - return; - } - let res = match rx.await { - Ok(r) => r, - Err(err) => { - error!( - "ChainPubsubClientImpl::recycle_connections: actor dropped recycle ack: {err:?}" - ); - return; - } - }; - if let Err(err) = res { - error!( - "ChainPubsubClientImpl::recycle_connections: recycle failed: {err:?}" - ); - } - } - fn take_updates(&self) -> mpsc::Receiver { // SAFETY: This can only be None if `take_updates` is called more than // once (double-take). That indicates a logic bug in the calling code. @@ -117,7 +210,10 @@ impl ChainPubsubClient for ChainPubsubClientImpl { }) .await?; - rx.await? + rx.await + .inspect_err(|err| { + warn!("ChainPubsubClientImpl::subscribe - RecvError occurred while awaiting subscription response for {}: {err:?}. This indicates the actor sender was dropped without responding.", pubkey); + })? } async fn unsubscribe( @@ -132,7 +228,53 @@ impl ChainPubsubClient for ChainPubsubClientImpl { }) .await?; - rx.await? + rx.await + .inspect_err(|err| { + warn!("ChainPubsubClientImpl::unsubscribe - RecvError occurred while awaiting unsubscription response for {}: {err:?}. This indicates the actor sender was dropped without responding.", pubkey); + })? + } + + async fn subscription_count( + &self, + exclude: Option<&[Pubkey]>, + ) -> (usize, usize) { + let total = self.actor.subscription_count(&[]); + let filtered = if let Some(exclude) = exclude { + self.actor.subscription_count(exclude) + } else { + total + }; + (total, filtered) + } + + fn subscriptions(&self) -> Vec { + self.actor.subscriptions() + } +} + +#[async_trait] +impl ReconnectableClient for ChainPubsubClientImpl { + async fn try_reconnect(&self) -> RemoteAccountProviderResult<()> { + let (tx, rx) = oneshot::channel(); + self.actor + .send_msg(ChainPubsubActorMessage::Reconnect { response: tx }) + .await?; + + rx.await.inspect_err(|err| { + warn!("RecvError occurred while awaiting reconnect response: {err:?}."); + })? 
+ } + + async fn resub_multiple( + &self, + pubkeys: &[Pubkey], + ) -> RemoteAccountProviderResult<()> { + for &pubkey in pubkeys { + self.subscribe(pubkey).await?; + // Don't spam the RPC provider - for 5,000 accounts we would take 250 secs = ~4 minutes + tokio::time::sleep(Duration::from_millis(50)).await; + } + Ok(()) } } @@ -141,13 +283,7 @@ impl ChainPubsubClient for ChainPubsubClientImpl { // ----------------- #[cfg(any(test, feature = "dev-context"))] pub mod mock { - use std::{ - collections::HashSet, - sync::{ - atomic::{AtomicU64, Ordering}, - Mutex, - }, - }; + use std::{collections::HashSet, sync::Mutex, time::Duration}; use log::*; use solana_account::Account; @@ -158,13 +294,17 @@ pub mod mock { use solana_sdk::clock::Slot; use super::*; + use crate::remote_account_provider::{ + RemoteAccountProviderError, RemoteAccountProviderResult, + }; #[derive(Clone)] pub struct ChainPubsubClientMock { updates_sndr: mpsc::Sender, updates_rcvr: Arc>>>, subscribed_pubkeys: Arc>>, - recycle_calls: Arc, + connected: Arc>, + pending_resubscribe_failures: Arc>, } impl ChainPubsubClientMock { @@ -176,12 +316,20 @@ pub mod mock { updates_sndr, updates_rcvr: Arc::new(Mutex::new(Some(updates_rcvr))), subscribed_pubkeys: Arc::new(Mutex::new(HashSet::new())), - recycle_calls: Arc::new(AtomicU64::new(0)), + connected: Arc::new(Mutex::new(true)), + pending_resubscribe_failures: Arc::new(Mutex::new(0)), } } - pub fn recycle_calls(&self) -> u64 { - self.recycle_calls.load(Ordering::SeqCst) + /// Simulate a disconnect: clear all subscriptions and mark client as disconnected. + pub fn simulate_disconnect(&self) { + *self.connected.lock().unwrap() = false; + self.subscribed_pubkeys.lock().unwrap().clear(); + } + + /// Fail the next N resubscription attempts in resub_multiple(). + pub fn fail_next_resubscriptions(&self, n: usize) { + *self.pending_resubscribe_failures.lock().unwrap() = n; } async fn send(&self, update: SubscriptionUpdate) { @@ -225,10 +373,6 @@ pub mod mock { #[async_trait] impl ChainPubsubClient for ChainPubsubClientMock { - async fn recycle_connections(&self) { - self.recycle_calls.fetch_add(1, Ordering::SeqCst); - } - fn take_updates(&self) -> mpsc::Receiver { // SAFETY: This can only be None if `take_updates` is called more // than once (double take). 
That would indicate a logic bug in the @@ -242,6 +386,13 @@ pub mod mock { &self, pubkey: Pubkey, ) -> RemoteAccountProviderResult<()> { + if !*self.connected.lock().unwrap() { + return Err( + RemoteAccountProviderError::AccountSubscriptionsFailed( + "mock: subscribe while disconnected".to_string(), + ), + ); + } let mut subscribed_pubkeys = self.subscribed_pubkeys.lock().unwrap(); subscribed_pubkeys.insert(pubkey); @@ -259,5 +410,60 @@ pub mod mock { } async fn shutdown(&self) {} + + async fn subscription_count( + &self, + exclude: Option<&[Pubkey]>, + ) -> (usize, usize) { + let pubkeys: Vec = { + let subs = self.subscribed_pubkeys.lock().unwrap(); + subs.iter().cloned().collect() + }; + let total = pubkeys.len(); + let exclude = exclude.unwrap_or_default(); + let filtered = pubkeys + .iter() + .filter(|pubkey| !exclude.contains(pubkey)) + .count(); + (total, filtered) + } + + fn subscriptions(&self) -> Vec { + let subs = self.subscribed_pubkeys.lock().unwrap(); + subs.iter().copied().collect() + } + } + + #[async_trait] + impl ReconnectableClient for ChainPubsubClientMock { + async fn try_reconnect(&self) -> RemoteAccountProviderResult<()> { + *self.connected.lock().unwrap() = true; + Ok(()) + } + + async fn resub_multiple( + &self, + pubkeys: &[Pubkey], + ) -> RemoteAccountProviderResult<()> { + // Simulate transient resubscription failures + { + let mut to_fail = + self.pending_resubscribe_failures.lock().unwrap(); + if *to_fail > 0 { + *to_fail -= 1; + return Err( + RemoteAccountProviderError::AccountSubscriptionsFailed( + "mock: forced resubscribe failure".to_string(), + ), + ); + } + } + for &pubkey in pubkeys { + self.subscribe(pubkey).await?; + // keep it small; tests shouldn't take long + tokio::time::sleep(Duration::from_millis(10)).await; + } + Ok(()) + } } } diff --git a/magicblock-chainlink/src/remote_account_provider/config.rs b/magicblock-chainlink/src/remote_account_provider/config.rs index be2aa0f1a..98f063df1 100644 --- a/magicblock-chainlink/src/remote_account_provider/config.rs +++ b/magicblock-chainlink/src/remote_account_provider/config.rs @@ -9,12 +9,25 @@ pub const DEFAULT_SUBSCRIBED_ACCOUNTS_LRU_CAPACITY: usize = 10_000; pub struct RemoteAccountProviderConfig { subscribed_accounts_lru_capacity: usize, lifecycle_mode: LifecycleMode, + enable_subscription_metrics: bool, } impl RemoteAccountProviderConfig { pub fn try_new( subscribed_accounts_lru_capacity: usize, lifecycle_mode: LifecycleMode, + ) -> RemoteAccountProviderResult { + Self::try_new_with_metrics( + subscribed_accounts_lru_capacity, + lifecycle_mode, + true, + ) + } + + pub fn try_new_with_metrics( + subscribed_accounts_lru_capacity: usize, + lifecycle_mode: LifecycleMode, + enable_subscription_metrics: bool, ) -> RemoteAccountProviderResult { if subscribed_accounts_lru_capacity == 0 { return Err(RemoteAccountProviderError::InvalidLruCapacity( @@ -24,6 +37,7 @@ impl RemoteAccountProviderConfig { Ok(Self { subscribed_accounts_lru_capacity, lifecycle_mode, + enable_subscription_metrics, }) } @@ -41,6 +55,10 @@ impl RemoteAccountProviderConfig { pub fn subscribed_accounts_lru_capacity(&self) -> usize { self.subscribed_accounts_lru_capacity } + + pub fn enable_subscription_metrics(&self) -> bool { + self.enable_subscription_metrics + } } impl Default for RemoteAccountProviderConfig { @@ -49,6 +67,7 @@ impl Default for RemoteAccountProviderConfig { subscribed_accounts_lru_capacity: DEFAULT_SUBSCRIBED_ACCOUNTS_LRU_CAPACITY, lifecycle_mode: LifecycleMode::default(), + enable_subscription_metrics: true, } } } 
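For context on the `enable_subscription_metrics` flag introduced above: `try_new` keeps the flag enabled by default, while `try_new_with_metrics` lets callers opt out, as the chainlink test setup in this PR does. A minimal usage sketch (illustrative only, based on the signatures in this hunk; imports omitted):

    // Default construction keeps subscription metrics enabled
    let config = RemoteAccountProviderConfig::try_new(
        DEFAULT_SUBSCRIBED_ACCOUNTS_LRU_CAPACITY,
        LifecycleMode::Ephemeral,
    )
    .unwrap();
    assert!(config.enable_subscription_metrics());

    // Tests disable the monitored-accounts gauge explicitly
    let test_config = RemoteAccountProviderConfig::try_new_with_metrics(
        1000,
        LifecycleMode::Ephemeral,
        false,
    )
    .unwrap();
    assert!(!test_config.enable_subscription_metrics());

    // Both constructors return RemoteAccountProviderError::InvalidLruCapacity
    // when the LRU capacity is zero.
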
diff --git a/magicblock-chainlink/src/remote_account_provider/lru_cache.rs b/magicblock-chainlink/src/remote_account_provider/lru_cache.rs index 6143026b2..4b95f7322 100644 --- a/magicblock-chainlink/src/remote_account_provider/lru_cache.rs +++ b/magicblock-chainlink/src/remote_account_provider/lru_cache.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, num::NonZeroUsize, sync::Mutex}; use log::*; use lru::LruCache; +use magicblock_metrics::metrics::inc_evicted_accounts_count; use solana_pubkey::Pubkey; use solana_sdk::sysvar; @@ -79,6 +80,7 @@ impl AccountsLruCache { .map(|(evicted_pubkey, _)| evicted_pubkey); if let Some(evicted_pubkey) = evicted { + inc_evicted_accounts_count(); debug_assert_ne!( evicted_pubkey, pubkey, "Should not evict the same pubkey that we added" @@ -113,6 +115,30 @@ impl AccountsLruCache { false } } + + pub fn len(&self) -> usize { + let subs = self + .subscribed_accounts + .lock() + .expect("subscribed_accounts lock poisoned"); + subs.len() + } + + pub fn never_evicted_accounts(&self) -> Vec { + self.accounts_to_never_evict.iter().cloned().collect() + } + + pub fn can_evict(&self, pubkey: &Pubkey) -> bool { + !self.accounts_to_never_evict.contains(pubkey) + } + + pub fn pubkeys(&self) -> Vec { + let subs = self + .subscribed_accounts + .lock() + .expect("subscribed_accounts lock poisoned"); + subs.iter().map(|(k, _)| *k).collect() + } } #[cfg(test)] @@ -237,4 +263,14 @@ mod tests { assert_eq!(evicted, Some(expected_evicted)); } } + + #[test] + fn test_never_evicted_accounts() { + let capacity = NonZeroUsize::new(3).unwrap(); + let cache = AccountsLruCache::new(capacity); + + let never_evicted = cache.never_evicted_accounts(); + // Should contain at least the clock sysvar + assert!(never_evicted.contains(&sysvar::clock::id())); + } } diff --git a/magicblock-chainlink/src/remote_account_provider/mod.rs b/magicblock-chainlink/src/remote_account_provider/mod.rs index 2daef9c1b..3c3700a75 100644 --- a/magicblock-chainlink/src/remote_account_provider/mod.rs +++ b/magicblock-chainlink/src/remote_account_provider/mod.rs @@ -1,11 +1,10 @@ use std::{ - collections::HashMap, + collections::{hash_map::Entry, HashMap, HashSet}, num::NonZeroUsize, sync::{ atomic::{AtomicU64, Ordering}, Arc, Mutex, }, - time::Duration, }; pub(crate) use chain_pubsub_client::{ @@ -33,7 +32,8 @@ use solana_rpc_client_api::{ use solana_sdk::{commitment_config::CommitmentConfig, sysvar::clock}; use tokio::{ sync::{mpsc, oneshot}, - task::{self, JoinSet}, + task, + time::{self, Duration}, }; pub(crate) mod chain_pubsub_actor; @@ -46,14 +46,24 @@ pub mod program_account; mod remote_account; pub use chain_pubsub_actor::SubscriptionUpdate; +use magicblock_metrics::{ + metrics, + metrics::{ + inc_account_fetches_failed, inc_account_fetches_found, + inc_account_fetches_not_found, inc_account_fetches_success, + set_monitored_accounts_count, + }, +}; pub use remote_account::{ResolvedAccount, ResolvedAccountSharedData}; use crate::{errors::ChainlinkResult, submux::SubMuxClient}; -// Simple tracking for accounts currently being fetched to handle race conditions +const ACTIVE_SUBSCRIPTIONS_UPDATE_INTERVAL_MS: u64 = 60_000; + // Maps pubkey -> (fetch_start_slot, requests_waiting) +type FetchResult = Result; type FetchingAccounts = - Mutex>)>>; + Mutex>)>>; pub struct ForwardedSubscriptionUpdate { pub pubkey: Pubkey, @@ -63,6 +73,8 @@ pub struct ForwardedSubscriptionUpdate { unsafe impl Send for ForwardedSubscriptionUpdate {} unsafe impl Sync for ForwardedSubscriptionUpdate {} +// Not sure why helius uses a 
different code for this error +const HELIUS_CONTEXT_SLOT_NOT_REACHED: i64 = -32603; pub struct RemoteAccountProvider { /// The RPC client to fetch accounts from chain the first time we receive /// a request for them @@ -84,7 +96,7 @@ pub struct RemoteAccountProvider { received_updates_count: Arc, /// Tracks which accounts are currently subscribed to - subscribed_accounts: AccountsLruCache, + lrucache_subscribed_accounts: Arc, /// Channel to notify when an account is removed from the cache and thus no /// longer being watched @@ -94,6 +106,9 @@ pub struct RemoteAccountProvider { removed_account_rx: Mutex>>, subscription_forwarder: Arc>, + + /// Task that periodically updates the active subscriptions gauge + _active_subscriptions_task_handle: Option>, } // ----------------- @@ -184,6 +199,79 @@ impl RemoteAccountProvider { Ok(None) } } + + /// Creates a background task that periodically updates the active subscriptions gauge + fn start_active_subscriptions_updater( + subscribed_accounts: Arc, + pubsub_client: Arc, + ) -> task::JoinHandle<()> { + task::spawn(async move { + let mut interval = time::interval(Duration::from_millis( + ACTIVE_SUBSCRIPTIONS_UPDATE_INTERVAL_MS, + )); + let never_evicted = subscribed_accounts.never_evicted_accounts(); + + loop { + interval.tick().await; + let lru_count = subscribed_accounts.len(); + let (pubsub_total, pubsub_without_never_evict) = pubsub_client + .subscription_count(Some(&never_evicted)) + .await; + + let all_pubsub_subs = if log::log_enabled!(log::Level::Debug) { + pubsub_client.subscriptions() + } else { + vec![] + }; + if lru_count != pubsub_without_never_evict { + warn!( + "User account subscription counts LRU cache={} pubsub client={} don't match", + lru_count, pubsub_without_never_evict + ); + if log::log_enabled!(log::Level::Debug) { + // Log all pubsub subscriptions for debugging + trace!( + "All pubsub subscriptions: {:?}", + all_pubsub_subs + ); + + // Find extra keys in pubsub that are not in LRU cache + let lru_pubkeys = subscribed_accounts.pubkeys(); + let pubsub_subs_without_never_evict: HashSet<_> = + all_pubsub_subs + .iter() + .filter(|pk| !never_evicted.contains(pk)) + .copied() + .collect(); + let lru_pubkeys_set: HashSet<_> = + lru_pubkeys.into_iter().collect(); + + let extra_in_pubsub: Vec<_> = + pubsub_subs_without_never_evict + .difference(&lru_pubkeys_set) + .cloned() + .collect(); + let extra_in_lru: Vec<_> = lru_pubkeys_set + .difference(&pubsub_subs_without_never_evict) + .cloned() + .collect(); + + if !extra_in_pubsub.is_empty() { + debug!("Extra pubkeys in pubsub client not in LRU cache: {:?}", extra_in_pubsub); + } + if !extra_in_lru.is_empty() { + debug!("Extra pubkeys in LRU cache not in pubsub client: {:?}", extra_in_lru); + } + } + } + + debug!("Updating active subscriptions: count={}", pubsub_total); + trace!("All subscriptions: {}", pubkeys_str(&all_pubsub_subs)); + set_monitored_accounts_count(pubsub_total); + } + }) + } + /// Creates a new instance of the remote account provider /// By the time this method returns the current chain slot was resolved and /// a subscription setup to keep it up to date. @@ -195,6 +283,24 @@ impl RemoteAccountProvider { ) -> RemoteAccountProviderResult { let (removed_account_tx, removed_account_rx) = tokio::sync::mpsc::channel(100); + let subscribed_accounts = Arc::new(AccountsLruCache::new({ + // SAFETY: NonZeroUsize::new only returns None if the value is 0. 
+ // RemoteAccountProviderConfig can only be constructed with + // capacity > 0 + let cap = config.subscribed_accounts_lru_capacity(); + NonZeroUsize::new(cap).expect("non-zero capacity") + })); + + let active_subscriptions_updater = + if config.enable_subscription_metrics() { + Some(Self::start_active_subscriptions_updater( + subscribed_accounts.clone(), + Arc::new(pubsub_client.clone()), + )) + } else { + None + }; + let me = Self { fetching_accounts: Arc::::default(), rpc_client, @@ -202,16 +308,11 @@ impl RemoteAccountProvider { chain_slot: Arc::::default(), last_update_slot: Arc::::default(), received_updates_count: Arc::::default(), - subscribed_accounts: AccountsLruCache::new({ - // SAFETY: NonZeroUsize::new only returns None if the value is 0. - // RemoteAccountProviderConfig can only be constructed with - // capacity > 0 - let cap = config.subscribed_accounts_lru_capacity(); - NonZeroUsize::new(cap).expect("non-zero capacity") - }), + lrucache_subscribed_accounts: subscribed_accounts.clone(), subscription_forwarder: Arc::new(subscription_forwarder), removed_account_tx, removed_account_rx: Mutex::new(Some(removed_account_rx)), + _active_subscriptions_task_handle: active_subscriptions_updater, }; let updates = me.pubsub_client.take_updates(); @@ -257,15 +358,17 @@ impl RemoteAccountProvider { }; // Build pubsub clients and wrap them into a SubMuxClient - let mut pubsubs: Vec> = + let mut pubsubs: Vec<(Arc, mpsc::Receiver<()>)> = Vec::with_capacity(endpoints.len()); for ep in endpoints { + let (abort_tx, abort_rx) = mpsc::channel(1); let client = ChainPubsubClientImpl::try_new_from_url( ep.pubsub_url.as_str(), + abort_tx, commitment, ) .await?; - pubsubs.push(Arc::new(client)); + pubsubs.push((Arc::new(client), abort_rx)); } let submux = SubMuxClient::new(pubsubs, None); @@ -277,7 +380,7 @@ impl RemoteAccountProvider { } pub(crate) fn promote_accounts(&self, pubkeys: &[&Pubkey]) { - self.subscribed_accounts.promote_multi(pubkeys); + self.lrucache_subscribed_accounts.promote_multi(pubkeys); } pub fn try_get_removed_account_rx( @@ -361,7 +464,8 @@ impl RemoteAccountProvider { // Resolve all pending requests with subscription data for sender in pending_requests { - let _ = sender.send(remote_account.clone()); + let _ = + sender.send(Ok(remote_account.clone())); } None } else { @@ -518,8 +622,8 @@ impl RemoteAccountProvider { return Ok(vec![]); } - if log_enabled!(log::Level::Debug) { - debug!("Fetching accounts: [{}]", pubkeys_str(pubkeys)); + if log_enabled!(log::Level::Trace) { + trace!("Fetching accounts: [{}]", pubkeys_str(pubkeys)); } // Create channels for potential subscription updates to override fetch results @@ -530,7 +634,14 @@ impl RemoteAccountProvider { let mut fetching = self.fetching_accounts.lock().unwrap(); for &pubkey in pubkeys { let (sender, receiver) = oneshot::channel(); - fetching.insert(pubkey, (fetch_start_slot, vec![sender])); + match fetching.entry(pubkey) { + Entry::Occupied(mut entry) => { + entry.get_mut().1.push(sender); + } + Entry::Vacant(entry) => { + entry.insert((fetch_start_slot, vec![sender])); + } + } subscription_overrides.push((pubkey, receiver)); } } @@ -550,10 +661,23 @@ impl RemoteAccountProvider { subscription_overrides.into_iter().enumerate() { match receiver.await { - Ok(remote_account) => resolved_accounts.push(remote_account), + Ok(result) => match result { + Ok(remote_account) => { + resolved_accounts.push(remote_account) + } + Err(err) => { + error!("Failed to fetch account {pubkey}: {err}"); + errors.push((idx, err)); + } + }, 
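+                        // Note: the outer `Err` below is a oneshot `RecvError` (the fetch task dropped the sender without responding); actual fetch failures arrive via the inner `Result` handled in the arm above.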
Err(err) => { + warn!("RemoteAccountProvider::try_get_multi - Unexpected RecvError while awaiting account {pubkey} at index {idx}: {err:?}. This should not happen with Result-based channels. Context: fetch_start_slot={fetch_start_slot}, min_context_slot={min_context_slot}, total_pubkeys={}", + pubkeys.len()); error!("Failed to resolve account {pubkey}: {err:?}"); - errors.push((idx, err)); + errors.push(( + idx, + RemoteAccountProviderError::RecvrError(err), + )); } } } @@ -586,66 +710,21 @@ impl RemoteAccountProvider { async fn setup_subscriptions( &self, - subscribe_and_fetch: &[(Pubkey, oneshot::Receiver)], + subscribe_and_fetch: &[(Pubkey, oneshot::Receiver)], ) -> RemoteAccountProviderResult<()> { - if log_enabled!(log::Level::Debug) { + if log_enabled!(log::Level::Trace) { let pubkeys = subscribe_and_fetch .iter() .map(|(pk, _)| pk.to_string()) .collect::>() .join(", "); - debug!("Subscribing to accounts: {pubkeys}"); - } - let subscription_results = { - let mut set = JoinSet::new(); - for (pubkey, _) in subscribe_and_fetch.iter() { - let pc = self.pubsub_client.clone(); - let pubkey = *pubkey; - set.spawn(async move { pc.subscribe(pubkey).await }); - } - set + trace!("Subscribing to accounts: {pubkeys}"); } - .join_all() - .await; - - let (new_subs, errs) = subscription_results - .into_iter() - .enumerate() - .fold((vec![], vec![]), |(mut new_subs, mut errs), (idx, res)| { - match res { - Ok(_) => { - if let Some((pubkey, _)) = subscribe_and_fetch.get(idx) - { - new_subs.push(pubkey); - } - } - Err(err) => errs.push((idx, err)), - } - (new_subs, errs) - }); - - if errs.is_empty() { - for pubkey in new_subs { - // Register the subscription for the pubkey - self.register_subscription(pubkey).await?; - } - Ok(()) - } else { - Err(RemoteAccountProviderError::AccountSubscriptionsFailed( - errs.iter() - .map(|(idx, err)| { - let pubkey = subscribe_and_fetch - .get(*idx) - .map(|(pk, _)| pk.to_string()) - .unwrap_or_else(|| { - "BUG: could not match pubkey".to_string() - }); - format!("{pubkey}: {err:?}") - }) - .collect::>() - .join(",\n"), - )) + for (pubkey, _) in subscribe_and_fetch.iter() { + // Register the subscription for the pubkey (handles LRU cache and eviction first) + self.subscribe(pubkey).await?; } + Ok(()) } /// Registers a new subscription for the given pubkey. @@ -653,17 +732,26 @@ impl RemoteAccountProvider { &self, pubkey: &Pubkey, ) -> RemoteAccountProviderResult<()> { - // If an account is evicted then we need to unsubscribe from it first + // 1. First realize subscription + self.pubsub_client.subscribe(*pubkey).await?; + + // 2. Add to LRU cache + // If an account is evicted then we need to unsubscribe from it // and then inform upstream that we are no longer tracking it - if let Some(evicted) = self.subscribed_accounts.add(*pubkey) { + if let Some(evicted) = self.lrucache_subscribed_accounts.add(*pubkey) { trace!("Evicting {pubkey}"); - // 1. Unsubscribe from the account - self.unsubscribe(&evicted).await?; + // 1. Unsubscribe from the account directly (LRU has already removed it) + if let Err(err) = self.pubsub_client.unsubscribe(evicted).await { + // Should we retry here? + warn!( + "Failed to unsubscribe from pubsub for evicted account {evicted}: {err:?}"); + } // 2. Inform upstream so it can remove it from the store self.send_removal_update(evicted).await?; } + Ok(()) } @@ -681,7 +769,7 @@ impl RemoteAccountProvider { /// This does not consider accounts like the clock sysvar that are watched as /// part of the provider's internal logic. 
pub fn is_watching(&self, pubkey: &Pubkey) -> bool { - self.subscribed_accounts.contains(pubkey) + self.lrucache_subscribed_accounts.contains(pubkey) } /// Check if an account is currently pending (being fetched) @@ -696,12 +784,12 @@ impl RemoteAccountProvider { pubkey: &Pubkey, ) -> RemoteAccountProviderResult<()> { if self.is_watching(pubkey) { + // Promote in LRU cache even if already subscribed + self.lrucache_subscribed_accounts.add(*pubkey); return Ok(()); } - self.subscribed_accounts.add(*pubkey); - self.pubsub_client.subscribe(*pubkey).await?; - + self.register_subscription(pubkey).await?; Ok(()) } @@ -710,10 +798,34 @@ impl RemoteAccountProvider { &self, pubkey: &Pubkey, ) -> RemoteAccountProviderResult<()> { - // Only maintain subscriptions if we were actually subscribed - if self.subscribed_accounts.remove(pubkey) { - self.pubsub_client.unsubscribe(*pubkey).await?; - self.send_removal_update(*pubkey).await?; + if !self.lrucache_subscribed_accounts.can_evict(pubkey) { + warn!( + "Tried to unsubscribe from account {} that should never be evicted", + pubkey + ); + return Ok(()); + } + if !self.lrucache_subscribed_accounts.contains(pubkey) { + warn!( + "Tried to unsubscribe from account {} that was not subscribed in the LRU cache", + pubkey + ); + return Ok(()); + } + + match self.pubsub_client.unsubscribe(*pubkey).await { + Ok(()) => { + // Only remove from LRU cache after successful pubsub unsubscribe + self.lrucache_subscribed_accounts.remove(pubkey); + self.send_removal_update(*pubkey).await?; + } + Err(err) => { + warn!( + "Failed to unsubscribe from pubsub for {pubkey}: {err:?}" + ); + // Don't remove from LRU cache if pubsub unsubscribe failed + // This ensures LRU cache and pubsub client stay in sync + } } Ok(()) @@ -732,19 +844,6 @@ impl RemoteAccountProvider { min_context_slot: u64, ) { const MAX_RETRIES: u64 = 10; - let mut remaining_retries: u64 = MAX_RETRIES; - macro_rules! retry { - ($msg:expr) => { - trace!($msg); - remaining_retries -= 1; - if remaining_retries <= 0 { - error!("Max retries {MAX_RETRIES} reached, giving up on fetching accounts: {pubkeys:?}"); - return; - } - tokio::time::sleep(Duration::from_millis(400)).await; - continue; - } - } let rpc_client = self.rpc_client.clone(); let fetching_accounts = self.fetching_accounts.clone(); @@ -754,14 +853,49 @@ impl RemoteAccountProvider { tokio::spawn(async move { use RemoteAccount::*; - if log_enabled!(log::Level::Debug) { - debug!("Fetch ({})", pubkeys_str(&pubkeys)); + // Helper to notify all pending requests of fetch failure + let notify_error = |error_msg: &str| { + let mut fetching = fetching_accounts.lock().unwrap(); + error!("{error_msg}"); + inc_account_fetches_failed(pubkeys.len() as u64); + for pubkey in &pubkeys { + // Update metrics + // Remove pending requests and send error + if let Some((_, requests)) = fetching.remove(pubkey) { + for sender in requests { + let error = RemoteAccountProviderError::AccountResolutionsFailed( + format!("{}: {}", pubkey, error_msg) + ); + let _ = sender.send(Err(error)); + } + } + } + }; + + let mut remaining_retries: u64 = MAX_RETRIES; + + if log_enabled!(log::Level::Trace) { + trace!("Fetch ({})", pubkeys_str(&pubkeys)); } + macro_rules! 
retry { + ($msg:expr) => {{ + trace!($msg); + remaining_retries -= 1; + if remaining_retries <= 0 { + let err_msg = format!("Max retries {MAX_RETRIES} reached, giving up on fetching accounts: {pubkeys:?}"); + notify_error(&err_msg); + return; + } + tokio::time::sleep(Duration::from_millis(400)).await; + continue; + }}; + } let response = loop { // We provide the min_context slot in order to _force_ the RPC to update // its account cache. Otherwise we could just keep fetching the accounts // until the context slot is high enough. + metrics::inc_remote_account_provider_a_count(); match rpc_client .get_multiple_accounts_with_config( &pubkeys, @@ -786,54 +920,58 @@ impl RemoteAccountProvider { ErrorKind::RpcError(rpc_err) => { match rpc_err { RpcError::ForUser(ref rpc_user_err) => { - // When an account is not present for the desired min-context slot - // then we normally get the below handled `RpcResponseError`, but may also - // get the following error from the RPC. + // When an account is not present for the desired + // min-context slot then we normally get the below + // handled `RpcResponseError`, but may also get the + // following error from the RPC. // See test::ixtest_existing_account_for_future_slot // ``` // RpcError( // ForUser( // "AccountNotFound: \ - // pubkey=DaeruQ4SukTQaJA5muyv51MQZok7oaCAF8fAW19mbJv5: \ + // pubkey=DaeruQ4SukTQaJA5muyv51MQZok7oaCAF8fAW19mbJv5: \ // RPC response error -32016: \ // Minimum context slot has not been reached; ", // ), // ) // ``` retry!("Fetching accounts failed: {rpc_user_err:?}"); - } + } RpcError::RpcResponseError { code, message, data, } => { - if code == JSON_RPC_SERVER_ERROR_MIN_CONTEXT_SLOT_NOT_REACHED { - retry!("Minimum context slot {min_context_slot} not reached for {commitment:?}."); + if code == JSON_RPC_SERVER_ERROR_MIN_CONTEXT_SLOT_NOT_REACHED || code == HELIUS_CONTEXT_SLOT_NOT_REACHED { + retry!("Minimum context slot {min_context_slot} not reached for {commitment:?}. code={code}, message={message}, data={data:?}"); } else { let err = RpcError::RpcResponseError { code, message, data, }; - error!( + let err_msg = format!( "RpcError fetching accounts {}: {err:?}", pubkeys_str(&pubkeys) ); + notify_error(&err_msg); return; } } err => { - error!( + let err_msg = format!( "RpcError fetching accounts {}: {err:?}", pubkeys_str(&pubkeys) ); - return; - } + notify_error(&err_msg); + return; + } } } _ => { - error!( + let err_msg = format!( "RpcError fetching accounts {}: {err:?}", pubkeys_str(&pubkeys) ); + notify_error(&err_msg); return; } }, @@ -843,16 +981,23 @@ impl RemoteAccountProvider { // TODO: should we retry if not or respond with an error? 
assert!(response.context.slot >= min_context_slot); + let mut found_count = 0u64; + let mut not_found_count = 0u64; + let remote_accounts: Vec = pubkeys .iter() .zip(response.value) .map(|(pubkey, acc)| match acc { - Some(value) => RemoteAccount::from_fresh_account( - value, - response.context.slot, - RemoteAccountUpdateSource::Fetch, - ), + Some(value) => { + found_count += 1; + RemoteAccount::from_fresh_account( + value, + response.context.slot, + RemoteAccountUpdateSource::Fetch, + ) + } None if mark_empty_if_not_found.contains(pubkey) => { + not_found_count += 1; RemoteAccount::from_fresh_account( Account { lamports: 0, @@ -865,10 +1010,18 @@ impl RemoteAccountProvider { RemoteAccountUpdateSource::Fetch, ) } - None => NotFound(response.context.slot), + None => { + not_found_count += 1; + NotFound(response.context.slot) + } }) .collect(); + // Update metrics for successful RPC fetch + inc_account_fetches_success(pubkeys.len() as u64); + inc_account_fetches_found(found_count); + inc_account_fetches_not_found(not_found_count); + if log_enabled!(log::Level::Trace) { let pubkeys = pubkeys .iter() @@ -903,7 +1056,7 @@ impl RemoteAccountProvider { // Send the fetch result to all waiting requests for request in requests { - let _ = request.send(remote_account.clone()); + let _ = request.send(Ok(remote_account.clone())); } } }); @@ -1005,11 +1158,17 @@ mod test { let pubsub_client = chain_pubsub_client::mock::ChainPubsubClientMock::new(tx, rx); let (fwd_tx, _fwd_rx) = mpsc::channel(100); + let config = RemoteAccountProviderConfig::try_new_with_metrics( + 1000, + LifecycleMode::Ephemeral, + false, + ) + .unwrap(); RemoteAccountProvider::new( rpc_client, pubsub_client, fwd_tx, - &RemoteAccountProviderConfig::default(), + &config, ) .await .unwrap() @@ -1049,11 +1208,18 @@ mod test { ( { let (fwd_tx, _fwd_rx) = mpsc::channel(100); + let config = + RemoteAccountProviderConfig::try_new_with_metrics( + 1000, + LifecycleMode::Ephemeral, + false, + ) + .unwrap(); RemoteAccountProvider::new( rpc_client.clone(), pubsub_client, fwd_tx, - &RemoteAccountProviderConfig::default(), + &config, ) .await .unwrap() @@ -1121,12 +1287,18 @@ mod test { let pubsub_client = ChainPubsubClientMock::new(tx, rx); let (forward_tx, forward_rx) = mpsc::channel(100); + let config = RemoteAccountProviderConfig::try_new_with_metrics( + 1000, + LifecycleMode::Ephemeral, + false, + ) + .unwrap(); ( RemoteAccountProvider::new( rpc_client, pubsub_client, forward_tx, - &RemoteAccountProviderConfig::default(), + &config, ) .await .unwrap(), @@ -1321,9 +1493,10 @@ mod test { rpc_client, pubsub_client, forward_tx, - &RemoteAccountProviderConfig::try_new( + &RemoteAccountProviderConfig::try_new_with_metrics( accounts_capacity, LifecycleMode::Ephemeral, + false, ) .unwrap(), ) diff --git a/magicblock-chainlink/src/remote_account_provider/program_account.rs b/magicblock-chainlink/src/remote_account_provider/program_account.rs index 6a6930a02..9ca640ce2 100644 --- a/magicblock-chainlink/src/remote_account_provider/program_account.rs +++ b/magicblock-chainlink/src/remote_account_provider/program_account.rs @@ -147,6 +147,7 @@ impl LoadedProgram { /// after the deploy. pub fn try_into_deploy_data_and_ixs_v4( self, + ephem_slot: u64, validator_auth: Pubkey, ) -> ClonerResult { let Self { @@ -156,13 +157,14 @@ impl LoadedProgram { loader, .. 
} = self; + let five_slots_ago = ephem_slot.saturating_sub(5).max(1); let pre_deploy_loader_state = LoaderV4State { - slot: 1, + slot: five_slots_ago, authority_address_or_next_version: validator_auth, status: LoaderV4Status::Retracted, }; let post_deploy_loader_state = LoaderV4State { - slot: 1, + slot: five_slots_ago, authority_address_or_next_version: authority, status: LoaderV4Status::Deployed, }; @@ -474,7 +476,7 @@ mod tests { loader_status: LoaderV4Status::Deployed, remote_slot: 0, } - .try_into_deploy_data_and_ixs_v4(validator_kp.pubkey()) + .try_into_deploy_data_and_ixs_v4(1, validator_kp.pubkey()) .unwrap(); let recent_blockhash = Hash::new_unique(); diff --git a/magicblock-chainlink/src/remote_account_provider/remote_account.rs b/magicblock-chainlink/src/remote_account_provider/remote_account.rs index bc401a35b..ada3bc48c 100644 --- a/magicblock-chainlink/src/remote_account_provider/remote_account.rs +++ b/magicblock-chainlink/src/remote_account_provider/remote_account.rs @@ -109,6 +109,14 @@ impl ResolvedAccountSharedData { self } + pub fn undelegating(&self) -> bool { + use ResolvedAccountSharedData::*; + match self { + Fresh(account) => account.undelegating(), + Bank(account) => account.undelegating(), + } + } + pub fn set_remote_slot(&mut self, remote_slot: Slot) -> &mut Self { use ResolvedAccountSharedData::*; match self { diff --git a/magicblock-chainlink/src/submux/mod.rs b/magicblock-chainlink/src/submux/mod.rs index 96ba10318..8c4e1f9d6 100644 --- a/magicblock-chainlink/src/submux/mod.rs +++ b/magicblock-chainlink/src/submux/mod.rs @@ -11,14 +11,14 @@ use solana_pubkey::Pubkey; use tokio::sync::mpsc; use crate::remote_account_provider::{ - chain_pubsub_client::ChainPubsubClient, - errors::RemoteAccountProviderResult, SubscriptionUpdate, + chain_pubsub_client::{ChainPubsubClient, ReconnectableClient}, + errors::RemoteAccountProviderResult, + SubscriptionUpdate, }; const SUBMUX_OUT_CHANNEL_SIZE: usize = 5_000; const DEDUP_WINDOW_MILLIS: u64 = 2_000; const DEBOUNCE_INTERVAL_MILLIS: u64 = 2_000; -const DEFAULT_RECYCLE_INTERVAL_MILLIS: u64 = 3_600_000; mod debounce_state; pub use self::debounce_state::DebounceState; @@ -97,7 +97,10 @@ pub struct DebounceConfig { /// - While waiting for eligibility in Enabled state, only the latest /// observed update is kept as pending so that the consumer receives /// the freshest state when the interval elapses. -pub struct SubMuxClient { +pub struct SubMuxClient +where + T: ChainPubsubClient + ReconnectableClient, +{ /// Underlying pubsub clients this mux controls and forwards to/from. clients: Vec>, /// Aggregated outgoing channel used by forwarder tasks to deliver @@ -128,20 +131,6 @@ pub struct SubMuxClient { never_debounce: HashSet, } -/// Configuration for SubMuxClient -#[derive(Debug, Clone, Default)] -pub struct SubMuxClientConfig { - /// The deduplication window in milliseconds. - pub dedupe_window_millis: Option, - /// The debounce interval in milliseconds. - pub debounce_interval_millis: Option, - /// The debounce detection window in milliseconds. - pub debounce_detection_window_millis: Option, - /// Interval (millis) at which to recycle inner client connections. - /// If None, defaults to DEFAULT_RECYCLE_INTERVAL_MILLIS. - pub recycle_interval_millis: Option, -} - // Parameters for the long-running forwarder loop, grouped to avoid // clippy::too_many_arguments and to keep spawn sites concise. 
struct ForwarderParams { @@ -154,9 +143,9 @@ struct ForwarderParams { allowed_count: usize, } -impl SubMuxClient { +impl SubMuxClient { pub fn new( - clients: Vec>, + clients: Vec<(Arc, mpsc::Receiver<()>)>, dedupe_window_millis: Option, ) -> Self { Self::new_with_debounce( @@ -169,16 +158,15 @@ impl SubMuxClient { } pub fn new_with_debounce( - clients: Vec>, + clients: Vec<(Arc, mpsc::Receiver<()>)>, config: DebounceConfig, ) -> Self { - Self::new_with_configs(clients, config, SubMuxClientConfig::default()) + Self::new_with_config(clients, config) } - pub fn new_with_configs( - clients: Vec>, + pub fn new_with_config( + clients: Vec<(Arc, mpsc::Receiver<()>)>, config: DebounceConfig, - mux_config: SubMuxClientConfig, ) -> Self { let (out_tx, out_rx) = mpsc::channel(SUBMUX_OUT_CHANNEL_SIZE); let dedup_cache = Arc::new(Mutex::new(HashMap::new())); @@ -197,6 +185,8 @@ impl SubMuxClient { let never_debounce: HashSet = vec![solana_sdk::sysvar::clock::ID].into_iter().collect(); + let clients = Self::spawn_reconnectors(clients); + let me = Self { clients, out_tx, @@ -212,10 +202,95 @@ impl SubMuxClient { // Spawn background tasks me.spawn_dedup_pruner(); me.spawn_debounce_flusher(); - me.maybe_spawn_connection_recycler(mux_config.recycle_interval_millis); me } + // ----------------- + // Reconnection + // ----------------- + fn spawn_reconnectors( + clients: Vec<(Arc, mpsc::Receiver<()>)>, + ) -> Vec> { + let clients_only = clients + .iter() + .map(|(c, _)| c.clone()) + .collect::>>(); + for (client, mut abort_rx) in clients.into_iter() { + let clients_clone = clients_only.clone(); + tokio::spawn(async move { + while abort_rx.recv().await.is_some() { + // Drain any duplicate abort signals to coalesce reconnect attempts + while abort_rx.try_recv().is_ok() {} + + debug!( + "Reconnecter received abort signal, reconnecting client" + ); + Self::reconnect_client_with_backoff( + client.clone(), + clients_clone.clone(), + ) + .await; + } + }); + } + clients_only + } + + async fn reconnect_client_with_backoff( + client: Arc, + all_clients: Vec>, + ) { + fn fib_with_max(n: u64) -> u64 { + let (mut a, mut b) = (0u64, 1u64); + for _ in 0..n { + (a, b) = (b, a.saturating_add(b)); + } + a.min(600) + } + + const WARN_EVERY_ATTEMPTS: u64 = 10; + let mut attempt = 0; + loop { + attempt += 1; + if Self::reconnect_client(client.clone(), &all_clients).await { + debug!( + "Successfully reconnected client after {} attempts", + attempt + ); + break; + } else { + if attempt % WARN_EVERY_ATTEMPTS == 0 { + error!("Failed to reconnect ({}) times", attempt); + } + let wait_duration = Duration::from_secs(fib_with_max(attempt)); + tokio::time::sleep(wait_duration).await; + debug!("Reconnect attempt {} failed, will retry", attempt); + } + } + } + + async fn reconnect_client(client: Arc, all_clients: &[Arc]) -> bool { + if let Err(err) = client.try_reconnect().await { + debug!("Failed to reconnect client: {:?}", err); + return false; + } + // Resubscribe all existing subscriptions sourced from still connected clients + // NOTE: that new subscriptions are already received now as well since the + // client marked itself as connected and is no longer blocking subscriptions + // See [ChainPubsubActor::handle_msg] and [ChainPubsubActor::try_reconnect] + let subs = Self::get_subscriptions(all_clients); + match client.resub_multiple(&subs).await { + Err(err) => { + debug!( + "Failed to resubscribe accounts after reconnect: {:?}", + err + ); + false + } + Ok(_) => true, + } + } + fn spawn_dedup_pruner(&self) { let window = 
self.dedup_window; let cache = self.dedup_cache.clone(); @@ -277,34 +352,6 @@ impl SubMuxClient { }); } - fn maybe_spawn_connection_recycler( - &self, - recycle_interval_millis: Option, - ) { - // Disabled when the interval is explicitly Some(0) - if recycle_interval_millis == Some(0) { - return; - } - let recycle_clients = self.clients.clone(); - let interval = Duration::from_millis( - recycle_interval_millis.unwrap_or(DEFAULT_RECYCLE_INTERVAL_MILLIS), - ); - tokio::spawn(async move { - let mut idx: usize = 0; - loop { - tokio::time::sleep(interval).await; - if recycle_clients.is_empty() { - continue; - } - let len = recycle_clients.len(); - let i = idx % len; - idx = (idx + 1) % len; - let client = recycle_clients[i].clone(); - client.recycle_connections().await; - } - }); - } - fn start_forwarders(&self) { let window = self.dedup_window; let debounce_interval = self.debounce_interval; @@ -499,6 +546,14 @@ impl SubMuxClient { maybe_forward_now } + fn get_subscriptions(clients: &[Arc]) -> Vec { + let mut all_subs = HashSet::new(); + for client in clients { + all_subs.extend(client.subscriptions()); + } + all_subs.into_iter().collect() + } + fn allowed_in_debounce_window_count(&self) -> usize { (self.debounce_detection_window.as_millis() / self.debounce_interval.as_millis()) as usize @@ -515,15 +570,10 @@ impl SubMuxClient { } #[async_trait] -impl ChainPubsubClient for SubMuxClient { - async fn recycle_connections(&self) { - // This recycles all inner clients which may not always make - // sense. Thus we don't expect this call on the Multiplexer itself. - for client in &self.clients { - client.recycle_connections().await; - } - } - +impl ChainPubsubClient for SubMuxClient +where + T: ChainPubsubClient + ReconnectableClient, +{ async fn subscribe( &self, pubkey: Pubkey, @@ -563,6 +613,34 @@ impl ChainPubsubClient for SubMuxClient { self.start_forwarders(); out_rx } + + /// Gets the maximum subscription count across all inner clients. + /// NOTE: one of the clients could be reconnecting and thus + /// temporarily have fewer or no subscriptions + async fn subscription_count( + &self, + exclude: Option<&[Pubkey]>, + ) -> (usize, usize) { + let mut max_total = 0; + let mut max_filtered = 0; + for client in &self.clients { + let (total, filtered) = client.subscription_count(exclude).await; + if total > max_total { + max_total = total; + } + if filtered > max_filtered { + max_filtered = filtered; + } + } + (max_total, max_filtered) + } + + /// Gets the union of all subscriptions across all inner clients. + /// Unless one is reconnecting, this should be identical to + /// getting it from a single inner client. 
+ fn subscriptions(&self) -> Vec { + Self::get_subscriptions(&self.clients) + } } #[cfg(test)] @@ -582,6 +660,53 @@ mod tests { ..Account::default() } } + fn new_submux_client( + clients: Vec>, + dedupe_window_millis: Option, + ) -> SubMuxClient { + let client_tuples = clients + .into_iter() + .map(|c| { + let (_abort_tx, abort_rx) = mpsc::channel(1); + (c, abort_rx) + }) + .collect(); + SubMuxClient::new(client_tuples, dedupe_window_millis) + } + + fn new_submux_client_with_debounce( + clients: Vec>, + config: DebounceConfig, + ) -> SubMuxClient { + let client_tuples = clients + .into_iter() + .map(|c| { + let (_abort_tx, abort_rx) = mpsc::channel(1); + (c, abort_rx) + }) + .collect(); + SubMuxClient::new_with_debounce(client_tuples, config) + } + + fn new_submux_with_abort( + clients: Vec>, + dedupe_window_millis: Option, + ) -> (SubMuxClient, Vec>) { + let mut abort_senders = Vec::new(); + let client_tuples = clients + .into_iter() + .map(|c| { + let (abort_tx, abort_rx) = mpsc::channel(4); + abort_senders.push(abort_tx); + (c, abort_rx) + }) + .collect(); + ( + SubMuxClient::new(client_tuples, dedupe_window_millis), + abort_senders, + ) + } + // ----------------- // Subscribe/Unsubscribe // ----------------- @@ -595,7 +720,7 @@ mod tests { let client1 = Arc::new(ChainPubsubClientMock::new(tx1, rx1)); let client2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); - let mux: SubMuxClient = SubMuxClient::new( + let mux: SubMuxClient = new_submux_client( vec![client1.clone(), client2.clone()], Some(100), ); @@ -648,7 +773,7 @@ mod tests { let client1 = Arc::new(ChainPubsubClientMock::new(tx1, rx1)); let client2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); - let mux: SubMuxClient = SubMuxClient::new( + let mux: SubMuxClient = new_submux_client( vec![client1.clone(), client2.clone()], Some(100), ); @@ -695,7 +820,7 @@ mod tests { let client1 = Arc::new(ChainPubsubClientMock::new(tx1, rx1)); let client2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); - let mux: SubMuxClient = SubMuxClient::new( + let mux: SubMuxClient = new_submux_client( vec![client1.clone(), client2.clone()], Some(100), ); @@ -756,7 +881,7 @@ mod tests { let client1 = Arc::new(ChainPubsubClientMock::new(tx1, rx1)); let client2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); - let mux: SubMuxClient = SubMuxClient::new( + let mux: SubMuxClient = new_submux_client( vec![client1.clone(), client2.clone()], Some(100), ); @@ -819,7 +944,7 @@ mod tests { let client2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); let client3 = Arc::new(ChainPubsubClientMock::new(tx3, rx3)); - let mux: SubMuxClient = SubMuxClient::new( + let mux: SubMuxClient = new_submux_client( vec![client1.clone(), client2.clone(), client3.clone()], Some(100), ); @@ -949,7 +1074,7 @@ mod tests { let (tx, rx) = mpsc::channel(10_000); let client = Arc::new(ChainPubsubClientMock::new(tx, rx)); let mux: SubMuxClient = - SubMuxClient::new_with_debounce( + new_submux_client_with_debounce( vec![client.clone()], DebounceConfig { dedupe_window_millis: Some(100), @@ -1007,7 +1132,7 @@ mod tests { let (tx, rx) = mpsc::channel(10_000); let client = Arc::new(ChainPubsubClientMock::new(tx, rx)); let mux: SubMuxClient = - SubMuxClient::new_with_debounce( + new_submux_client_with_debounce( vec![client.clone()], DebounceConfig { dedupe_window_millis: Some(100), @@ -1045,7 +1170,7 @@ mod tests { let (tx, rx) = mpsc::channel(10_000); let client = Arc::new(ChainPubsubClientMock::new(tx, rx)); let mux: SubMuxClient = - SubMuxClient::new_with_debounce( + 
new_submux_client_with_debounce( vec![client.clone()], DebounceConfig { dedupe_window_millis: Some(100), @@ -1103,7 +1228,7 @@ mod tests { let (tx, rx) = mpsc::channel(10_000); let client = Arc::new(ChainPubsubClientMock::new(tx, rx)); let mux: SubMuxClient = - SubMuxClient::new_with_debounce( + new_submux_client_with_debounce( vec![client.clone()], DebounceConfig { dedupe_window_millis: Some(100), @@ -1140,60 +1265,120 @@ mod tests { } // ----------------- - // Connection recycling + // Reconnection Tests // ----------------- - async fn setup_recycling( - interval_millis: Option, - ) -> ( - SubMuxClient, - Arc, - Arc, - Arc, - ) { + #[tokio::test] + async fn test_reconnect_on_disconnect_reestablishes_subscriptions() { init_logger(); - let (tx1, rx1) = mpsc::channel(1); - let (tx2, rx2) = mpsc::channel(1); - let (tx3, rx3) = mpsc::channel(1); - let c1 = Arc::new(ChainPubsubClientMock::new(tx1, rx1)); - let c2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); - let c3 = Arc::new(ChainPubsubClientMock::new(tx3, rx3)); - let mux: SubMuxClient = - SubMuxClient::new_with_configs( - vec![c1.clone(), c2.clone(), c3.clone()], - DebounceConfig::default(), - SubMuxClientConfig { - recycle_interval_millis: interval_millis, - ..SubMuxClientConfig::default() - }, - ); + let (tx1, rx1) = mpsc::channel(10_000); + let (tx2, rx2) = mpsc::channel(10_000); + let client1 = Arc::new(ChainPubsubClientMock::new(tx1, rx1)); + let client2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); - (mux, c1, c2, c3) - } - #[tokio::test] - async fn test_connection_recycling_enabled() { - let (mux, c1, c2, c3) = setup_recycling(Some(50)).await; + let (mux, aborts) = new_submux_with_abort( + vec![client1.clone(), client2.clone()], + Some(100), + ); + let mut mux_rx = mux.take_updates(); + + let pk = Pubkey::new_unique(); + mux.subscribe(pk).await.unwrap(); + + // Baseline: client1 update arrives + client1 + .send_account_update(pk, 1, &account_with_lamports(111)) + .await; + tokio::time::timeout( + std::time::Duration::from_millis(200), + mux_rx.recv(), + ) + .await + .expect("got baseline update") + .expect("stream open"); + + // Simulate disconnect: client1 loses subscriptions and is "disconnected" + client1.simulate_disconnect(); - // allow 4 intervals (at ~50ms each) -> calls: c1,c2,c3,c1 - tokio::time::sleep(Duration::from_millis(220)).await; + // Trigger reconnect via abort channel + aborts[0].send(()).await.expect("abort send"); - assert_eq!(c1.recycle_calls(), 2); - assert_eq!(c2.recycle_calls(), 1); - assert_eq!(c3.recycle_calls(), 1); + // Wait for reconnect to complete + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + + // After reconnect + resubscribe, client1's updates should be forwarded again + client1 + .send_account_update(pk, 2, &account_with_lamports(222)) + .await; + + let up = tokio::time::timeout( + std::time::Duration::from_secs(1), + mux_rx.recv(), + ) + .await + .expect("expect update after reconnect") + .expect("stream open"); + assert_eq!(up.pubkey, pk); + assert_eq!(up.rpc_response.context.slot, 2); mux.shutdown().await; } #[tokio::test] - async fn test_connection_recycling_disabled() { - let (mux, c1, c2, c3) = setup_recycling(Some(0)).await; + async fn test_reconnect_after_failed_resubscription_eventually_recovers() { + init_logger(); + + let (tx1, rx1) = mpsc::channel(10_000); + let (tx2, rx2) = mpsc::channel(10_000); + let client1 = Arc::new(ChainPubsubClientMock::new(tx1, rx1)); + let client2 = Arc::new(ChainPubsubClientMock::new(tx2, rx2)); + + let (mux, aborts) = 
new_submux_with_abort( + vec![client1.clone(), client2.clone()], + Some(100), + ); + let mut mux_rx = mux.take_updates(); - // wait enough time to ensure it would have recycled if enabled - tokio::time::sleep(Duration::from_millis(220)).await; + let pk = Pubkey::new_unique(); + mux.subscribe(pk).await.unwrap(); + + // Prepare: first resubscribe attempt will fail + client1.fail_next_resubscriptions(1); + + // Simulate disconnect: client1 loses subs and is disconnected + client1.simulate_disconnect(); + + // Trigger reconnect; first attempt will fail resub; reconnector will retry after ~1s (fib(1)=1) + aborts[0].send(()).await.expect("abort send"); + + // Send updates until one passes after reconnection and resubscribe succeed + // Keep unique slots to avoid dedupe + let mut slot: u64 = 100; + let deadline = Instant::now() + Duration::from_secs(3); + let mut got = None; + while Instant::now() < deadline { + client1 + .send_account_update( + pk, + slot, + &account_with_lamports(1_000 + slot), + ) + .await; + if let Ok(Some(u)) = tokio::time::timeout( + std::time::Duration::from_millis(200), + mux_rx.recv(), + ) + .await + { + got = Some(u); + break; + } + slot += 1; + } - assert_eq!(c1.recycle_calls(), 0); - assert_eq!(c2.recycle_calls(), 0); - assert_eq!(c3.recycle_calls(), 0); + let up = got.expect("should receive update after retry reconnect"); + assert_eq!(up.pubkey, pk); + assert!(up.rpc_response.context.slot >= 100); mux.shutdown().await; } diff --git a/magicblock-chainlink/src/testing/chain_pubsub.rs b/magicblock-chainlink/src/testing/chain_pubsub.rs index 94f1e8dc7..56a4157d5 100644 --- a/magicblock-chainlink/src/testing/chain_pubsub.rs +++ b/magicblock-chainlink/src/testing/chain_pubsub.rs @@ -16,8 +16,10 @@ pub async fn setup_actor_and_client() -> ( mpsc::Receiver, RpcClient, ) { + let (tx, _) = mpsc::channel(10); let (actor, updates_rx) = ChainPubsubActor::new_from_url( PUBSUB_URL, + tx, CommitmentConfig::confirmed(), ) .await @@ -54,13 +56,13 @@ pub async fn unsubscribe(actor: &ChainPubsubActor, pubkey: Pubkey) { .expect("unsubscribe failed"); } -pub async fn recycle(actor: &ChainPubsubActor) { +pub async fn reconnect(actor: &ChainPubsubActor) { let (tx, rx) = oneshot::channel(); actor - .send_msg(ChainPubsubActorMessage::RecycleConnections { response: tx }) + .send_msg(ChainPubsubActorMessage::Reconnect { response: tx }) .await - .expect("failed to send RecycleConnections message"); + .expect("failed to send Reconnect message"); rx.await - .expect("recycle ack channel dropped") - .expect("recycle failed"); + .expect("reconnect ack channel dropped") + .expect("reconnect failed"); } diff --git a/magicblock-chainlink/src/testing/mod.rs b/magicblock-chainlink/src/testing/mod.rs index fd9769892..423576a64 100644 --- a/magicblock-chainlink/src/testing/mod.rs +++ b/magicblock-chainlink/src/testing/mod.rs @@ -142,6 +142,98 @@ macro_rules! assert_cloned_as_undelegated { }}; } +#[macro_export] +macro_rules! 
assert_cloned_as_delegated_with_retries { + ($cloner:expr, $pubkeys:expr, $retries:expr) => {{ + for pubkey in $pubkeys { + let mut account_opt = None; + for _ in 0..$retries { + account_opt = $cloner.get_account(pubkey); + if let Some(account) = &account_opt { + if account.delegated() { + break; + } + } + ::std::thread::sleep(::std::time::Duration::from_millis(100)); + } + let account = account_opt + .expect(&format!("Expected account {} to be cloned", pubkey)); + assert!( + account.delegated(), + "Expected account {} to be delegated", + pubkey + ); + } + }}; + ($cloner:expr, $pubkeys:expr, $slot:expr, $retries:expr) => {{ + for pubkey in $pubkeys { + let mut account_opt = None; + for _ in 0..$retries { + account_opt = $cloner.get_account(pubkey); + if let Some(account) = &account_opt { + if account.delegated() && account.remote_slot() == $slot { + break; + } + } + ::std::thread::sleep(::std::time::Duration::from_millis(100)); + } + let account = account_opt + .expect(&format!("Expected account {} to be cloned", pubkey)); + assert!( + account.delegated(), + "Expected account {} to be delegated", + pubkey + ); + assert_eq!( + account.remote_slot(), + $slot, + "Expected account {} to have remote slot {}", + pubkey, + $slot + ); + } + }}; + ($cloner:expr, $pubkeys:expr, $slot:expr, $owner:expr, $retries:expr) => {{ + use solana_account::ReadableAccount; + for pubkey in $pubkeys { + let mut account_opt = None; + for _ in 0..$retries { + account_opt = $cloner.get_account(pubkey); + if let Some(account) = &account_opt { + if account.delegated() + && account.remote_slot() == $slot + && account.owner() == &$owner + { + break; + } + } + ::std::thread::sleep(::std::time::Duration::from_millis(100)); + } + let account = account_opt + .expect(&format!("Expected account {} to be cloned", pubkey)); + assert!( + account.delegated(), + "Expected account {} to be delegated", + pubkey + ); + assert_eq!( + account.remote_slot(), + $slot, + "Expected account {} to have remote slot {}", + pubkey, + $slot + ); + assert_eq!( + account.owner(), + &$owner, + "Expected account {} to have owner {}", + pubkey, + $owner + ); + } + }}; +} + #[macro_export] macro_rules! 
assert_cloned_as_delegated { ($cloner:expr, $pubkeys:expr) => {{ diff --git a/magicblock-chainlink/tests/utils/test_context.rs b/magicblock-chainlink/tests/utils/test_context.rs index 7c9bbad55..3e41702de 100644 --- a/magicblock-chainlink/tests/utils/test_context.rs +++ b/magicblock-chainlink/tests/utils/test_context.rs @@ -105,6 +105,7 @@ impl TestContext { fetch_cloner, validator_pubkey, faucet_pubkey, + 0, ) .unwrap(); Self { diff --git a/magicblock-committor-service/src/intent_executor/task_info_fetcher.rs b/magicblock-committor-service/src/intent_executor/task_info_fetcher.rs index d76f97c86..2928d2f33 100644 --- a/magicblock-committor-service/src/intent_executor/task_info_fetcher.rs +++ b/magicblock-committor-service/src/intent_executor/task_info_fetcher.rs @@ -8,6 +8,7 @@ use dlp::{ }; use log::{error, warn}; use lru::LruCache; +use magicblock_metrics::metrics; use magicblock_rpc_client::{MagicBlockRpcClientError, MagicblockRpcClient}; use solana_pubkey::Pubkey; @@ -116,6 +117,7 @@ impl CacheTaskInfoFetcher { }) .collect::>(); + metrics::inc_task_info_fetcher_a_count(); let accounts_data = rpc_client .get_multiple_accounts(&pda_accounts, None) .await?; diff --git a/magicblock-metrics/src/metrics/mod.rs b/magicblock-metrics/src/metrics/mod.rs index 96171bcdd..b2f1b44ad 100644 --- a/magicblock-metrics/src/metrics/mod.rs +++ b/magicblock-metrics/src/metrics/mod.rs @@ -38,7 +38,7 @@ lazy_static::lazy_static! { static ref CACHED_CLONE_OUTPUTS_COUNT: IntGauge = IntGauge::new( - "magicblock_account_cloner_cached_outputs", + "magicblock_account_cloner_cached_outputs_count", "Number of cloned accounts in the RemoteAccountClonerWorker" ) .unwrap(); @@ -47,7 +47,7 @@ lazy_static::lazy_static! { // Ledger // ----------------- static ref LEDGER_SIZE_GAUGE: IntGauge = IntGauge::new( - "ledger_size", "Ledger size in Bytes", + "ledger_size_gauge", "Ledger size in Bytes", ).unwrap(); static ref LEDGER_BLOCK_TIMES_GAUGE: IntGauge = IntGauge::new( "ledger_blocktimes_gauge", "Ledger Blocktimes Gauge", @@ -101,29 +101,24 @@ lazy_static::lazy_static! 
{ // Accounts // ----------------- static ref ACCOUNTS_SIZE_GAUGE: IntGauge = IntGauge::new( - "accounts_size", "Size of persisted accounts (in bytes) currently on disk", + "accounts_size_gauge", "Size of persisted accounts (in bytes) currently on disk", ).unwrap(); static ref ACCOUNTS_COUNT_GAUGE: IntGauge = IntGauge::new( - "accounts_count", "Number of accounts currently in the database", + "accounts_count_gauge", "Number of accounts currently in the database", ).unwrap(); static ref PENDING_ACCOUNT_CLONES_GAUGE: IntGauge = IntGauge::new( - "pending_account_clones", "Total number of account clone requests still in memory", + "pending_account_clones_gauge", "Total number of account clone requests still in memory", ).unwrap(); static ref MONITORED_ACCOUNTS_GAUGE: IntGauge = IntGauge::new( - "monitored_accounts", "number of undelegated accounts, being monitored via websocket", + "monitored_accounts_gauge", "number of undelegated accounts, being monitored via websocket", ).unwrap(); - static ref SUBSCRIPTIONS_COUNT_GAUGE: IntGaugeVec = IntGaugeVec::new( - Opts::new("subscriptions_count", "number of active account subscriptions"), - &["shard"], - ).unwrap(); - - static ref EVICTED_ACCOUNTS_COUNT: IntGauge = IntGauge::new( - "evicted_accounts", "number of accounts forcefully removed from monitored list and database", + static ref EVICTED_ACCOUNTS_COUNT: IntCounter = IntCounter::new( + "evicted_accounts_count", "Total cumulative number of accounts forcefully removed from monitored list and database (monotonically increasing)", ).unwrap(); // ----------------- @@ -167,7 +162,7 @@ lazy_static::lazy_static! { ).unwrap(); pub static ref TRANSACTION_SKIP_PREFLIGHT: IntCounter = IntCounter::new( - "transaction_skip_preflight", "Count of transactions that skipped the preflight check", + "transaction_skip_preflight_count", "Count of transactions that skipped the preflight check", ).unwrap(); pub static ref RPC_REQUESTS_COUNT: IntCounterVec = IntCounterVec::new( @@ -180,6 +175,60 @@ lazy_static::lazy_static! 
{ &["name"], ).unwrap(); + // Account fetch results from network (RPC) + pub static ref ACCOUNT_FETCHES_SUCCESS_COUNT: IntCounter = + IntCounter::new( + "account_fetches_success_count", + "Total number of successful network \ + account fetches", + ) + .unwrap(); + + pub static ref ACCOUNT_FETCHES_FAILED_COUNT: IntCounter = + IntCounter::new( + "account_fetches_failed_count", + "Total number of failed network account fetches \ + (RPC errors)", + ) + .unwrap(); + + pub static ref ACCOUNT_FETCHES_FOUND_COUNT: IntCounter = + IntCounter::new( + "account_fetches_found_count", + "Total number of network account fetches that \ + found an account", + ) + .unwrap(); + + pub static ref ACCOUNT_FETCHES_NOT_FOUND_COUNT: IntCounter = + IntCounter::new( + "account_fetches_not_found_count", + "Total number of network account fetches where \ + account was not found", + ) + .unwrap(); + + pub static ref UNDELEGATION_REQUESTED_COUNT: IntCounter = + IntCounter::new( + "undelegation_requested_count", + "Total number of undelegation requests received", + ) + .unwrap(); + + pub static ref UNDELEGATION_COMPLETED_COUNT: IntCounter = + IntCounter::new( + "undelegation_completed_count", + "Total number of completed undelegations detected", + ) + .unwrap(); + + pub static ref UNSTUCK_UNDELEGATION_COUNT: IntCounter = + IntCounter::new( + "unstuck_undelegation_count", + "Total number of undelegating accounts found to be already undelegated on chain", + ) + .unwrap(); + + // ----------------- // Transaction Execution @@ -217,7 +266,24 @@ lazy_static::lazy_static! { ).unwrap(); static ref COMMITTOR_INTENT_CU_USAGE: IntGauge = IntGauge::new( - "committor_intent_cu_usage", "Compute units used for Intent" + "committor_intent_cu_usage_gauge", "Compute units used for Intent" ).unwrap(); + + // getMultipleAccounts investigation + static ref REMOTE_ACCOUNT_PROVIDER_A_COUNT: IntCounter = IntCounter::new( + "remote_account_provider_a_count", "Get multiple accounts count" + ).unwrap(); + + static ref TASK_INFO_FETCHER_A_COUNT: IntCounter = IntCounter::new( + "task_info_fetcher_a_count", "Get multiple accounts count" + ).unwrap(); + + static ref TABLE_MANIA_A_COUNT: IntCounter = IntCounter::new( + "table_mania_a_count", "Get multiple accounts count" + ).unwrap(); + + static ref TABLE_MANIA_CLOSED_A_COUNT: IntCounter = IntCounter::new( + "table_mania_closed_a_count", "Get account counter" + ).unwrap(); } @@ -250,7 +316,6 @@ pub(crate) fn register() { register!(ACCOUNTS_COUNT_GAUGE); register!(PENDING_ACCOUNT_CLONES_GAUGE); register!(MONITORED_ACCOUNTS_GAUGE); - register!(SUBSCRIPTIONS_COUNT_GAUGE); register!(EVICTED_ACCOUNTS_COUNT); register!(COMMITTOR_INTENTS_BACKLOG_COUNT); register!(COMMITTOR_FAILED_INTENTS_COUNT); @@ -263,7 +328,18 @@ register!(TRANSACTION_SKIP_PREFLIGHT); register!(RPC_REQUESTS_COUNT); register!(RPC_WS_SUBSCRIPTIONS_COUNT); + register!(ACCOUNT_FETCHES_SUCCESS_COUNT); + register!(ACCOUNT_FETCHES_FAILED_COUNT); + register!(ACCOUNT_FETCHES_FOUND_COUNT); + register!(ACCOUNT_FETCHES_NOT_FOUND_COUNT); + register!(UNDELEGATION_REQUESTED_COUNT); + register!(UNDELEGATION_COMPLETED_COUNT); + register!(UNSTUCK_UNDELEGATION_COUNT); register!(FAILED_TRANSACTIONS_COUNT); + register!(REMOTE_ACCOUNT_PROVIDER_A_COUNT); + register!(TASK_INFO_FETCHER_A_COUNT); + register!(TABLE_MANIA_A_COUNT); + register!(TABLE_MANIA_CLOSED_A_COUNT); }); } @@ -275,12 +351,6 @@ pub fn set_cached_clone_outputs_count(count: usize) { CACHED_CLONE_OUTPUTS_COUNT.set(count as i64); } -pub fn set_subscriptions_count(count: usize,
shard: &str) { - SUBSCRIPTIONS_COUNT_GAUGE - .with_label_values(&[shard]) - .set(count as i64); -} - pub fn set_ledger_size(size: u64) { LEDGER_SIZE_GAUGE.set(size as i64); } @@ -356,7 +426,11 @@ pub fn ensure_accounts_end(timer: HistogramTimer) { timer.stop_and_record(); } -pub fn adjust_monitored_accounts_count(count: usize) { +/// Sets the absolute number of monitored accounts. +/// +/// This metric reflects the current total count of accounts being monitored. +/// Callers must pass the total number of monitored accounts, not a delta. +pub fn set_monitored_accounts_count(count: usize) { MONITORED_ACCOUNTS_GAUGE.set(count as i64); } pub fn inc_evicted_accounts_count() { @@ -393,3 +467,47 @@ pub fn observe_committor_intent_execution_time_histogram( pub fn set_commmittor_intent_cu_usage(value: i64) { COMMITTOR_INTENT_CU_USAGE.set(value) } + +pub fn inc_account_fetches_success(count: u64) { + ACCOUNT_FETCHES_SUCCESS_COUNT.inc_by(count); +} + +pub fn inc_account_fetches_failed(count: u64) { + ACCOUNT_FETCHES_FAILED_COUNT.inc_by(count); +} + +pub fn inc_account_fetches_found(count: u64) { + ACCOUNT_FETCHES_FOUND_COUNT.inc_by(count); +} + +pub fn inc_account_fetches_not_found(count: u64) { + ACCOUNT_FETCHES_NOT_FOUND_COUNT.inc_by(count); +} + +pub fn inc_undelegation_requested() { + UNDELEGATION_REQUESTED_COUNT.inc(); +} + +pub fn inc_undelegation_completed() { + UNDELEGATION_COMPLETED_COUNT.inc(); +} + +pub fn inc_unstuck_undelegation_count() { + UNSTUCK_UNDELEGATION_COUNT.inc(); +} + +pub fn inc_remote_account_provider_a_count() { + REMOTE_ACCOUNT_PROVIDER_A_COUNT.inc() +} + +pub fn inc_task_info_fetcher_a_count() { + TASK_INFO_FETCHER_A_COUNT.inc() +} + +pub fn inc_table_mania_a_count() { + TABLE_MANIA_A_COUNT.inc() +} + +pub fn inc_table_mania_close_a_count() { + TABLE_MANIA_CLOSED_A_COUNT.inc() +} diff --git a/magicblock-metrics/src/service.rs b/magicblock-metrics/src/service.rs index 4e2b08623..25f0627f9 100644 --- a/magicblock-metrics/src/service.rs +++ b/magicblock-metrics/src/service.rs @@ -112,7 +112,7 @@ async fn metrics_service_router( .unwrap_or_default(), ); } - match (req.method(), req.uri().path()) { + let result = match (req.method(), req.uri().path()) { (&Method::GET, "/metrics") => { let metrics = TextEncoder::new() .encode_to_string(&metrics::REGISTRY.gather()) @@ -127,7 +127,14 @@ async fn metrics_service_router( *not_found.status_mut() = StatusCode::NOT_FOUND; Ok(not_found) } - } + }; + // We must consume the body fully to keep the connection alive. We + // iterate over all chunks and simply drop them. This prevents garbage + // data of previous requests from being stuck in connection buffer. 
+ let mut body = req.into_body(); + while (body.frame().await).is_some() {} + + result } fn full>(chunk: T) -> BoxBody { diff --git a/magicblock-processor/Cargo.toml b/magicblock-processor/Cargo.toml index 8aa007057..1cde50705 100644 --- a/magicblock-processor/Cargo.toml +++ b/magicblock-processor/Cargo.toml @@ -41,6 +41,7 @@ solana-transaction-error = { workspace = true } [dev-dependencies] guinea = { workspace = true } +solana-keypair = { workspace = true } solana-signature = { workspace = true } solana-signer = { workspace = true } test-kit = { workspace = true } diff --git a/magicblock-processor/src/executor/processing.rs b/magicblock-processor/src/executor/processing.rs index 641fda1d7..c1e6a9ce2 100644 --- a/magicblock-processor/src/executor/processing.rs +++ b/magicblock-processor/src/executor/processing.rs @@ -1,4 +1,4 @@ -use log::error; +use log::*; use magicblock_core::{ link::{ accounts::{AccountWithSlot, LockedAccount}, @@ -10,6 +10,7 @@ use magicblock_core::{ tls::ExecutionTlsStash, }; use magicblock_metrics::metrics::FAILED_TRANSACTIONS_COUNT; +use solana_account::ReadableAccount; use solana_pubkey::Pubkey; use solana_svm::{ account_loader::{AccountsBalances, CheckedTransactionDetails}, @@ -20,7 +21,7 @@ use solana_svm::{ }; use solana_svm_transaction::svm_message::SVMMessage; use solana_transaction::sanitized::SanitizedTransaction; -use solana_transaction_error::TransactionResult; +use solana_transaction_error::{TransactionError, TransactionResult}; use solana_transaction_status::{ map_inner_instructions, TransactionStatusMeta, }; @@ -167,9 +168,40 @@ impl super::TransactionExecutor { // SAFETY: // we passed a single transaction for execution, and // we will get a guaranteed single result back. - let result = output.processing_results.pop().expect( + let mut result = output.processing_results.pop().expect( "single transaction result is always present in the output", ); + + let gasless = self.environment.fee_lamports_per_signature == 0; + // If we are running in the gasless mode, we should not allow + // any mutation of the feepayer account, since that would make + // it possible for malicious actors to perform transfer operations + // from undelegated feepayers to delegated accounts, which would + // result in validator losing funds upon balance settling. + if gasless { + let undelegated_feepayer_was_modified = result + .as_ref() + .ok() + .and_then(|r| r.executed_transaction()) + .and_then(|txn| { + let first_acc = txn.loaded_transaction.accounts.first(); + let rollback_lamports = rollback_feepayer_lamports( + &txn.loaded_transaction.rollback_accounts, + ); + first_acc.map(|acc| (acc, rollback_lamports)) + }) + .map(|(acc, rollback_lamports)| { + (acc.1.is_dirty() + && (acc.1.lamports() != 0 || rollback_lamports != 0)) + && !acc.1.delegated() + && !acc.1.privileged() + }) + .unwrap_or(false); + + if undelegated_feepayer_was_modified { + result = Err(TransactionError::InvalidAccountForFee); + } + } (result, output.balances) } @@ -298,10 +330,23 @@ impl super::TransactionExecutor { } }; + // The first loaded account is always a feepayer, check + // whether we are running in privileged execution mode + let privileged = accounts + .first() + .map(|feepayer| feepayer.1.privileged()) + .unwrap_or_default(); + for (pubkey, account) in accounts { // only persist account's update if it was actually modified, ignore - // the rest, even if an account was writeable in the transaction - if !account.is_dirty() { + // the rest, even if an account was writeable in the transaction. 
+            //
+            // We also don't persist accounts that are empty, except in the
+            // special case where they are inserted forcefully as placeholders
+            // (for example by the chainlink). Those cases can be distinguished
+            // from the rest by the fact that such a transaction always runs in
+            // privileged mode.
+            if !account.is_dirty() || (account.lamports() == 0 && !privileged) {
                 continue;
             }
             self.accountsdb.insert_account(pubkey, account);
@@ -317,3 +362,19 @@ impl super::TransactionExecutor {
         }
     }
 }
+
+// A helper to extract the rollback lamports of the feepayer
+fn rollback_feepayer_lamports(rollback: &RollbackAccounts) -> u64 {
+    match rollback {
+        RollbackAccounts::FeePayerOnly { fee_payer_account } => {
+            fee_payer_account.lamports()
+        }
+        RollbackAccounts::SameNonceAndFeePayer { nonce } => {
+            nonce.account().lamports()
+        }
+        RollbackAccounts::SeparateNonceAndFeePayer {
+            fee_payer_account,
+            ..
+        } => fee_payer_account.lamports(),
+    }
+}
diff --git a/magicblock-processor/tests/fees.rs b/magicblock-processor/tests/fees.rs
index ca559dfd1..3c1898313 100644
--- a/magicblock-processor/tests/fees.rs
+++ b/magicblock-processor/tests/fees.rs
@@ -1,7 +1,9 @@
 use std::{collections::HashSet, time::Duration};
 
 use guinea::GuineaInstruction;
+use magicblock_core::traits::AccountsBank;
 use solana_account::{ReadableAccount, WritableAccount};
+use solana_keypair::Keypair;
 use solana_program::{
     instruction::{AccountMeta, Instruction},
     native_token::LAMPORTS_PER_SOL,
@@ -307,3 +309,100 @@ async fn test_transaction_gasless_mode() {
         "payer balance should not change in gasless mode"
     );
 }
+
+/// Verifies that in zero-fee ("gasless") mode, transactions are processed
+/// successfully when using a non-existing account (not the feepayer).
+#[tokio::test]
+async fn test_transaction_gasless_mode_with_not_existing_account() {
+    // Initialize the environment with a base fee of 0.
+    let env = ExecutionTestEnv::new_with_fee(0);
+    let mut payer = env.get_payer();
+    payer.set_lamports(1); // Not enough to cover standard fee
+    payer.set_delegated(false); // Explicitly set the payer as NON-delegated.
+    let initial_balance = payer.lamports();
+    payer.commmit();
+
+    let ix = Instruction::new_with_bincode(
+        guinea::ID,
+        &GuineaInstruction::PrintSizes,
+        vec![AccountMeta {
+            pubkey: Keypair::new().pubkey(),
+            is_signer: false,
+            is_writable: false,
+        }],
+    );
+    let txn = env.build_transaction(&[ix]);
+    let signature = txn.signatures[0];
+
+    // In a normal fee-paying mode, this execution would fail.
+    env.execute_transaction(txn)
+        .await
+        .expect("transaction should succeed in gasless mode");
+
+    // Verify the transaction was fully processed and broadcast successfully.
+    let status = env
+        .dispatch
+        .transaction_status
+        .recv_timeout(Duration::from_millis(100))
+        .expect("should receive a transaction status update");
+
+    assert_eq!(status.signature, signature);
+    assert!(
+        status.result.result.is_ok(),
+        "Transaction execution should be successful"
+    );
+
+    // Verify that absolutely no fee was charged.
+    let final_balance = env.get_payer().lamports();
+    assert_eq!(
+        initial_balance, final_balance,
+        "payer balance should not change in gasless mode"
+    );
+}
+
+/// Verifies that in zero-fee ("gasless") mode, transactions are processed
+/// successfully even when the fee payer does not exist.
+#[tokio::test]
+async fn test_transaction_gasless_mode_not_existing_feepayer() {
+    // Initialize the environment with a base fee of 0.
+    let payer = Keypair::new();
+    let env = ExecutionTestEnv::new_with_payer_and_fees(&payer, 0);
+
+    // Simple noop instruction that does not touch the fee payer account
+    let ix = Instruction::new_with_bincode(
+        guinea::ID,
+        &GuineaInstruction::PrintSizes,
+        vec![],
+    );
+    let txn = env.build_transaction(&[ix]);
+    let signature = txn.signatures[0];
+
+    // In a normal fee-paying mode, this execution would fail.
+    env.execute_transaction(txn)
+        .await
+        .expect("transaction should succeed in gasless mode");
+
+    // Verify the transaction was fully processed and broadcast successfully.
+    let status = env
+        .dispatch
+        .transaction_status
+        .recv_timeout(Duration::from_millis(100))
+        .expect("should receive a transaction status update");
+
+    assert_eq!(status.signature, signature);
+    assert!(
+        status.result.result.is_ok(),
+        "Transaction execution should be successful"
+    );
+
+    // Verify that the payer balance is zero (or doesn't exist)
+    let final_balance = env
+        .accountsdb
+        .get_account(&payer.pubkey())
+        .unwrap_or_default()
+        .lamports();
+    assert_eq!(
+        final_balance, 0,
+        "payer balance of a non-existing feepayer should be 0 in gasless mode"
+    );
+}
diff --git a/magicblock-table-mania/Cargo.toml b/magicblock-table-mania/Cargo.toml
index 5cca6e5f8..c1a4fb009 100644
--- a/magicblock-table-mania/Cargo.toml
+++ b/magicblock-table-mania/Cargo.toml
@@ -14,6 +14,7 @@ doctest = false
 ed25519-dalek = { workspace = true }
 log = { workspace = true }
 magicblock-rpc-client = { workspace = true }
+magicblock-metrics = { workspace = true }
 rand = { workspace = true }
 sha3 = { workspace = true }
 solana-pubkey = { workspace = true }
diff --git a/magicblock-table-mania/src/lookup_table_rc.rs b/magicblock-table-mania/src/lookup_table_rc.rs
index 8220ecd8b..2a245c387 100644
--- a/magicblock-table-mania/src/lookup_table_rc.rs
+++ b/magicblock-table-mania/src/lookup_table_rc.rs
@@ -9,6 +9,7 @@ use std::{
 };
 
 use log::*;
+use magicblock_metrics::metrics;
 use magicblock_rpc_client::{
     MagicBlockRpcClientError, MagicBlockSendTransactionConfig,
     MagicblockRpcClient,
@@ -696,6 +697,7 @@ impl LookupTableRc {
         &self,
         rpc_client: &MagicblockRpcClient,
     ) -> TableManiaResult {
+        metrics::inc_table_mania_close_a_count();
         let acc = rpc_client.get_account(self.table_address()).await?;
         Ok(acc.is_none())
     }
diff --git a/magicblock-table-mania/src/manager.rs b/magicblock-table-mania/src/manager.rs
index 4901ccd7a..473c5db10 100644
--- a/magicblock-table-mania/src/manager.rs
+++ b/magicblock-table-mania/src/manager.rs
@@ -8,6 +8,7 @@ use std::{
 };
 
 use log::*;
+use magicblock_metrics::metrics;
 use magicblock_rpc_client::MagicblockRpcClient;
 use solana_pubkey::Pubkey;
 use solana_sdk::{
@@ -526,6 +527,7 @@ impl TableMania {
             .join(", ");
 
         loop {
+            metrics::inc_table_mania_a_count();
             // Fetch the tables from chain
             let remote_table_accs = self
                 .rpc_client
diff --git a/programs/magicblock/src/schedule_transactions/process_schedule_base_intent.rs b/programs/magicblock/src/schedule_transactions/process_schedule_base_intent.rs
index 9d72075e6..5c9ca050c 100644
--- a/programs/magicblock/src/schedule_transactions/process_schedule_base_intent.rs
+++ b/programs/magicblock/src/schedule_transactions/process_schedule_base_intent.rs
@@ -145,6 +145,7 @@ pub(crate) fn process_schedule_base_intent(
             .into_iter()
             .for_each(|(_, account_ref)| {
                 set_account_owner_to_delegation_program(account_ref);
+                account_ref.borrow_mut().set_undelegating(true);
             });
     }
 
diff --git a/test-integration/Cargo.lock b/test-integration/Cargo.lock
index 2bc260861..a161243a7
100644 --- a/test-integration/Cargo.lock +++ b/test-integration/Cargo.lock @@ -2935,6 +2935,8 @@ dependencies = [ "solana-transaction-status", "tempfile", "toml 0.8.23", + "ureq", + "url 2.5.4", ] [[package]] @@ -3612,6 +3614,7 @@ dependencies = [ name = "magicblock-chainlink" version = "0.2.3" dependencies = [ + "arc-swap", "async-trait", "bincode", "env_logger 0.11.8", @@ -3621,6 +3624,7 @@ dependencies = [ "magicblock-core", "magicblock-delegation-program", "magicblock-magic-program-api 0.2.3", + "magicblock-metrics", "serde_json", "solana-account", "solana-account-decoder", @@ -3900,6 +3904,7 @@ version = "0.2.3" dependencies = [ "ed25519-dalek", "log", + "magicblock-metrics", "magicblock-rpc-client", "rand 0.8.5", "sha3", @@ -5627,6 +5632,7 @@ version = "0.23.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7160e3e10bf4535308537f3c4e1641468cd0e485175d6163087c0393c7d46643" dependencies = [ + "log", "once_cell", "ring", "rustls-pki-types", @@ -5923,10 +5929,11 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ + "serde_core", "serde_derive", ] @@ -5948,11 +5955,20 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + [[package]] name = "serde_derive" -version = "1.0.219" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -6244,7 +6260,7 @@ dependencies = [ [[package]] name = "solana-account" version = "2.2.1" -source = "git+https://github.com/magicblock-labs/solana-account.git?rev=f454d4a#f454d4a67a1ca64b87002025868f5369428e1c54" +source = "git+https://github.com/magicblock-labs/solana-account.git?rev=731fa50#731fa5037bf89929da76759f2281c1cb4833a8b7" dependencies = [ "bincode", "qualifier_attr", @@ -11187,6 +11203,22 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64 0.22.1", + "flate2", + "log", + "once_cell", + "rustls 0.23.28", + "rustls-pki-types", + "url 2.5.4", + "webpki-roots 0.26.11", +] + [[package]] name = "uriparse" version = "0.6.4" @@ -11451,6 +11483,24 @@ version = "0.25.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.4", +] + +[[package]] +name = "webpki-roots" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b2878ef029c47c6e8cf779119f20fcf52bde7ad42a731b2a304bc221df17571e" +dependencies = [ + "rustls-pki-types", +] + [[package]] name = "which" version = "4.4.2" diff --git a/test-integration/Cargo.toml b/test-integration/Cargo.toml index 9720dd913..c224fc024 100644 --- a/test-integration/Cargo.toml +++ b/test-integration/Cargo.toml @@ -74,7 +74,7 @@ rayon = "1.10.0" schedulecommit-client = { path = "schedulecommit/client" } serde = "1.0.217" serial_test = "3.2.0" -solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "f454d4a" } +solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "731fa50" } solana-loader-v2-interface = "2.2" solana-loader-v3-interface = "4.0" solana-loader-v4-interface = "2.1" @@ -97,6 +97,8 @@ test-ledger-restore = { path = "./test-ledger-restore" } test-kit = { path = "../test-kit" } tokio = "1.0" toml = "0.8.13" +ureq = "2.9.6" +url = "2.5.0" [patch.crates-io] # some solana dependencies have solana-storage-proto as dependency @@ -104,4 +106,4 @@ toml = "0.8.13" # and we use protobuf-src v2.1.1. Otherwise compilation fails solana-storage-proto = { path = "../storage-proto" } # same reason as above -solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "f454d4a" } +solana-account = { git = "https://github.com/magicblock-labs/solana-account.git", rev = "731fa50" } diff --git a/test-integration/test-chainlink/src/ixtest_context.rs b/test-integration/test-chainlink/src/ixtest_context.rs index 8053eee75..bb5ca5e51 100644 --- a/test-integration/test-chainlink/src/ixtest_context.rs +++ b/test-integration/test-chainlink/src/ixtest_context.rs @@ -140,6 +140,7 @@ impl IxtestContext { fetch_cloner, validator_kp.pubkey(), faucet_kp.pubkey(), + 0, ) .unwrap(); diff --git a/test-integration/test-chainlink/src/test_context.rs b/test-integration/test-chainlink/src/test_context.rs index 7c9bbad55..a90d3d986 100644 --- a/test-integration/test-chainlink/src/test_context.rs +++ b/test-integration/test-chainlink/src/test_context.rs @@ -67,14 +67,18 @@ impl TestContext { let faucet_pubkey = Pubkey::new_unique(); let (fetch_cloner, remote_account_provider) = { let (tx, rx) = tokio::sync::mpsc::channel(100); + let config = RemoteAccountProviderConfig::try_new_with_metrics( + 1000, // subscribed_accounts_lru_capacity + lifecycle_mode, + false, // disable subscription metrics + ) + .unwrap(); let remote_account_provider = RemoteAccountProvider::try_from_clients_and_mode( rpc_client.clone(), pubsub_client.clone(), tx, - &RemoteAccountProviderConfig::default_with_lifecycle_mode( - lifecycle_mode, - ), + &config, ) .await; @@ -105,6 +109,7 @@ impl TestContext { fetch_cloner, validator_pubkey, faucet_pubkey, + 0, ) .unwrap(); Self { diff --git a/test-integration/test-chainlink/tests/chain_pubsub_actor.rs b/test-integration/test-chainlink/tests/chain_pubsub_actor.rs index 087eab526..66c2b9c08 100644 --- a/test-integration/test-chainlink/tests/chain_pubsub_actor.rs +++ b/test-integration/test-chainlink/tests/chain_pubsub_actor.rs @@ -2,7 +2,7 @@ use magicblock_chainlink::{ remote_account_provider::SubscriptionUpdate, testing::{ chain_pubsub::{ - recycle, setup_actor_and_client, subscribe, unsubscribe, + reconnect, setup_actor_and_client, subscribe, unsubscribe, }, utils::{airdrop, init_logger, random_pubkey}, }, @@ -90,9 +90,16 @@ async fn ixtest_recycle_connections() { .await; // 5. Recycle connections - recycle(&actor).await; + reconnect(&actor).await; - // 6. 
Airdrop again and ensure we receive the update again
+    // 6. Airdrop again and ensure we don't yet receive the update
+    airdrop(&rpc_client, &pubkey, 2_500_000).await;
+    expect_no_update_for(&mut updates_rx, pubkey, 1500).await;
+
+    // 7. Resubscribe to the account
+    subscribe(&actor, pubkey).await;
+
+    // 8. Airdrop again and ensure we receive the update again
     let _second_update = airdrop_and_expect_update(
         &rpc_client,
         &mut updates_rx,
@@ -144,7 +151,20 @@ async fn ixtest_recycle_connections_multiple_accounts() {
     unsubscribe(&actor, unsub_pk).await;
 
     // Recycle connections
-    recycle(&actor).await;
+    reconnect(&actor).await;
+
+    // Airdrop to each and ensure we receive no updates yet
+    for &pk in &pks {
+        airdrop(&rpc_client, &pk, 2_500_000).await;
+    }
+    for &pk in &pks {
+        expect_no_update_for(&mut updates_rx, pk, 1500).await;
+    }
+
+    // Resubscribe to first three
+    for &pk in &pks[0..3] {
+        subscribe(&actor, pk).await;
+    }
 
     // Airdrop to first three and expect updates
     for &pk in &pks[0..3] {
diff --git a/test-integration/test-chainlink/tests/chain_pubsub_client.rs b/test-integration/test-chainlink/tests/chain_pubsub_client.rs
index f34c011b4..21ebbcea1 100644
--- a/test-integration/test-chainlink/tests/chain_pubsub_client.rs
+++ b/test-integration/test-chainlink/tests/chain_pubsub_client.rs
@@ -23,8 +23,10 @@ use tokio::{sync::mpsc, task};
 async fn setup() -> (ChainPubsubClientImpl, mpsc::Receiver) {
     init_logger();
 
+    let (tx, _) = mpsc::channel(10);
     let client = ChainPubsubClientImpl::try_new_from_url(
         PUBSUB_URL,
+        tx,
         CommitmentConfig::confirmed(),
     )
     .await
diff --git a/test-integration/test-chainlink/tests/ix_06_redeleg_us_separate_slots.rs b/test-integration/test-chainlink/tests/ix_06_redeleg_us_separate_slots.rs
index 052e6bee6..f31017dac 100644
--- a/test-integration/test-chainlink/tests/ix_06_redeleg_us_separate_slots.rs
+++ b/test-integration/test-chainlink/tests/ix_06_redeleg_us_separate_slots.rs
@@ -5,7 +5,7 @@
 use log::*;
 use magicblock_chainlink::{
-    assert_cloned_as_delegated, assert_cloned_as_undelegated,
+    assert_cloned_as_delegated_with_retries, assert_cloned_as_undelegated,
     assert_not_subscribed, assert_subscribed_without_delegation_record,
     testing::init_logger,
 };
 
@@ -34,14 +34,16 @@ async fn ixtest_undelegate_redelegate_to_us_in_separate_slots() {
         info!("1. Account delegated to us");
 
         ctx.chainlink.ensure_accounts(&pubkeys, None).await.unwrap();
+        sleep_ms(1_500).await;
 
         // Account should be cloned as delegated
         let account = ctx.cloner.get_account(&counter_pda).unwrap();
-        assert_cloned_as_delegated!(
+        assert_cloned_as_delegated_with_retries!(
             ctx.cloner,
             &[counter_pda],
             account.remote_slot(),
-            program_flexi_counter::id()
+            program_flexi_counter::id(),
+            30
         );
 
         // Accounts delegated to us should not be tracked via subscription
@@ -58,6 +60,7 @@ async fn ixtest_undelegate_redelegate_to_us_in_separate_slots() {
         );
 
         ctx.undelegate_counter(&counter_auth, false).await;
+        sleep_ms(1_500).await;
 
         // Account should be cloned as undelegated (owned by program again)
         let account = ctx.cloner.get_account(&counter_pda).unwrap();
@@ -75,15 +78,16 @@ async fn ixtest_undelegate_redelegate_to_us_in_separate_slots() {
     {
         info!("3. 
Account redelegated to us - Would allow write"); ctx.delegate_counter(&counter_auth).await; - sleep_ms(500).await; + sleep_ms(1_500).await; // Account should be cloned as delegated back to us let account = ctx.cloner.get_account(&counter_pda).unwrap(); - assert_cloned_as_delegated!( + assert_cloned_as_delegated_with_retries!( ctx.cloner, &[counter_pda], account.remote_slot(), - program_flexi_counter::id() + program_flexi_counter::id(), + 30 ); // Accounts delegated to us should not be tracked via subscription diff --git a/test-integration/test-chainlink/tests/ix_07_redeleg_us_same_slot.rs b/test-integration/test-chainlink/tests/ix_07_redeleg_us_same_slot.rs index 68b8e7be5..984d1c3d1 100644 --- a/test-integration/test-chainlink/tests/ix_07_redeleg_us_same_slot.rs +++ b/test-integration/test-chainlink/tests/ix_07_redeleg_us_same_slot.rs @@ -5,7 +5,8 @@ use log::*; use magicblock_chainlink::{ - assert_cloned_as_delegated, assert_not_subscribed, testing::init_logger, + assert_cloned_as_delegated, assert_not_subscribed, + testing::{init_logger, utils::sleep_ms}, }; use solana_sdk::{signature::Keypair, signer::Signer}; use test_chainlink::ixtest_context::IxtestContext; @@ -32,6 +33,7 @@ async fn ixtest_undelegate_redelegate_to_us_in_same_slot() { info!("1. Account delegated to us"); ctx.chainlink.ensure_accounts(&pubkeys, None).await.unwrap(); + sleep_ms(1_500).await; // Account should be cloned as delegated let account = ctx.cloner.get_account(&counter_pda).unwrap(); @@ -57,6 +59,9 @@ async fn ixtest_undelegate_redelegate_to_us_in_same_slot() { ctx.undelegate_counter(&counter_auth, true).await; + // Wait for pubsub update to trigger subscription handler + sleep_ms(1_500).await; + // Account should still be cloned as delegated to us let account = ctx.cloner.get_account(&counter_pda).unwrap(); assert_cloned_as_delegated!( diff --git a/test-integration/test-chainlink/tests/ix_exceed_capacity.rs b/test-integration/test-chainlink/tests/ix_exceed_capacity.rs index 44c2d69c6..cc76a94c4 100644 --- a/test-integration/test-chainlink/tests/ix_exceed_capacity.rs +++ b/test-integration/test-chainlink/tests/ix_exceed_capacity.rs @@ -11,9 +11,10 @@ async fn setup( pubkeys_len: usize, ) -> (IxtestContext, Vec) { let config = { - let rap_config = RemoteAccountProviderConfig::try_new( + let rap_config = RemoteAccountProviderConfig::try_new_with_metrics( subscribed_accounts_lru_capacity, LifecycleMode::Ephemeral, + false, ) .unwrap(); ChainlinkConfig::new(rap_config) diff --git a/test-integration/test-chainlink/tests/ix_remote_account_provider.rs b/test-integration/test-chainlink/tests/ix_remote_account_provider.rs index 47534ab03..cdd5e6ff8 100644 --- a/test-integration/test-chainlink/tests/ix_remote_account_provider.rs +++ b/test-integration/test-chainlink/tests/ix_remote_account_provider.rs @@ -4,7 +4,8 @@ use magicblock_chainlink::{ remote_account_provider::{ chain_pubsub_client::ChainPubsubClientImpl, chain_rpc_client::ChainRpcClientImpl, - config::RemoteAccountProviderConfig, Endpoint, RemoteAccountProvider, + config::RemoteAccountProviderConfig, Endpoint, + ForwardedSubscriptionUpdate, RemoteAccountProvider, RemoteAccountUpdateSource, }, submux::SubMuxClient, @@ -21,35 +22,45 @@ use solana_rpc_client_api::{ use solana_sdk::commitment_config::CommitmentConfig; use tokio::sync::mpsc; -async fn init_remote_account_provider() -> RemoteAccountProvider< - ChainRpcClientImpl, - SubMuxClient, -> { - let (fwd_tx, _fwd_rx) = mpsc::channel(100); +async fn init_remote_account_provider() -> ( + RemoteAccountProvider< 
+ ChainRpcClientImpl, + SubMuxClient, + >, + mpsc::Receiver, +) { + let (fwd_tx, fwd_rx) = mpsc::channel(100); let endpoints = [Endpoint { rpc_url: RPC_URL.to_string(), pubsub_url: PUBSUB_URL.to_string(), }]; - RemoteAccountProvider::< - ChainRpcClientImpl, - SubMuxClient, - >::try_new_from_urls( - &endpoints, - CommitmentConfig::confirmed(), - fwd_tx, - &RemoteAccountProviderConfig::default_with_lifecycle_mode( - LifecycleMode::Ephemeral, - ), + ( + RemoteAccountProvider::< + ChainRpcClientImpl, + SubMuxClient, + >::try_new_from_urls( + &endpoints, + CommitmentConfig::confirmed(), + fwd_tx, + &RemoteAccountProviderConfig::try_new_with_metrics( + 1000, + LifecycleMode::Ephemeral, + false, + ) + .unwrap(), + ) + .await + .unwrap(), + fwd_rx, ) - .await - .unwrap() } #[tokio::test] async fn ixtest_get_non_existing_account() { init_logger(); - let remote_account_provider = init_remote_account_provider().await; + let (remote_account_provider, _fwd_rx) = + init_remote_account_provider().await; let pubkey = random_pubkey(); let remote_account = remote_account_provider.try_get(pubkey).await.unwrap(); @@ -60,7 +71,8 @@ async fn ixtest_get_non_existing_account() { async fn ixtest_existing_account_for_future_slot() { init_logger(); - let remote_account_provider = init_remote_account_provider().await; + let (remote_account_provider, _fwd_rx) = + init_remote_account_provider().await; let pubkey = random_pubkey(); let rpc_client = remote_account_provider.rpc_client(); @@ -95,7 +107,8 @@ async fn ixtest_existing_account_for_future_slot() { async fn ixtest_get_existing_account_for_valid_slot() { init_logger(); - let remote_account_provider = init_remote_account_provider().await; + let (remote_account_provider, _fwd_rx) = + init_remote_account_provider().await; let pubkey = random_pubkey(); let rpc_client = remote_account_provider.rpc_client(); @@ -131,7 +144,8 @@ async fn ixtest_get_existing_account_for_valid_slot() { async fn ixtest_get_multiple_accounts_for_valid_slot() { init_logger(); - let remote_account_provider = init_remote_account_provider().await; + let (remote_account_provider, _fwd_rx) = + init_remote_account_provider().await; let (pubkey1, pubkey2, pubkey3, pubkey4) = ( random_pubkey(), @@ -141,15 +155,9 @@ async fn ixtest_get_multiple_accounts_for_valid_slot() { ); let rpc_client = remote_account_provider.rpc_client(); - airdrop(rpc_client, &pubkey1, 1_000_000).await; - airdrop(rpc_client, &pubkey2, 2_000_000).await; - airdrop(rpc_client, &pubkey3, 3_000_000).await; - let all_pubkeys = vec![pubkey1, pubkey2, pubkey3, pubkey4]; { - // Fetching immediately does not return the accounts yet - // They are updated via subscriptions instead let remote_accounts = remote_account_provider .try_get_multi(&all_pubkeys, None) .await @@ -168,6 +176,10 @@ async fn ixtest_get_multiple_accounts_for_valid_slot() { ); } + airdrop(rpc_client, &pubkey1, 1_000_000).await; + airdrop(rpc_client, &pubkey2, 2_000_000).await; + airdrop(rpc_client, &pubkey3, 3_000_000).await; + sleep_ms(500).await; await_next_slot(rpc_client).await; diff --git a/test-integration/test-cloning/tests/01_program-deploy.rs b/test-integration/test-cloning/tests/01_program-deploy.rs index 9ac56f282..11ecb5346 100644 --- a/test-integration/test-cloning/tests/01_program-deploy.rs +++ b/test-integration/test-cloning/tests/01_program-deploy.rs @@ -183,7 +183,7 @@ async fn test_clone_mini_v4_loader_program_and_upgrade() { loop { ctx.wait_for_delta_slot_ephem(5).unwrap(); - let bump = (remaining_retries - MAX_RETRIES) + 1; + let bump = 
MAX_RETRIES.saturating_sub(remaining_retries) + 1; let msg = format!("Hola Mundo {bump}"); let ix = sdk.log_msg_instruction(&payer.pubkey(), &msg); let (sig, found) = ctx diff --git a/test-integration/test-cloning/tests/04_escrow_transfer.rs b/test-integration/test-cloning/tests/04_escrow_transfer.rs index fdf436b21..35c617063 100644 --- a/test-integration/test-cloning/tests/04_escrow_transfer.rs +++ b/test-integration/test-cloning/tests/04_escrow_transfer.rs @@ -1,14 +1,47 @@ use integration_test_tools::IntegrationTestContext; use log::*; use solana_sdk::{ - native_token::LAMPORTS_PER_SOL, signature::Keypair, signer::Signer, - system_instruction, + native_token::LAMPORTS_PER_SOL, pubkey::Pubkey, signature::Keypair, + signer::Signer, system_instruction, }; use test_kit::init_logger; use crate::utils::init_and_delegate_flexi_counter; mod utils; +fn log_accounts_balances( + ctx: &IntegrationTestContext, + stage: &str, + counter: &Pubkey, + payer: &Pubkey, + escrow: &Pubkey, +) -> (u64, u64, u64) { + let accs = ctx + .fetch_ephem_multiple_accounts(&[*counter, *payer, *escrow]) + .unwrap(); + let [counter_acc, payer_acc, escrow_acc] = accs.as_slice() else { + panic!("Expected 3 accounts, got {:#?}", accs); + }; + + let counter_balance = + counter_acc.as_ref().unwrap().lamports as f64 / LAMPORTS_PER_SOL as f64; + let payer_balance = + payer_acc.as_ref().unwrap().lamports as f64 / LAMPORTS_PER_SOL as f64; + let escrow_balance = + escrow_acc.as_ref().unwrap().lamports as f64 / LAMPORTS_PER_SOL as f64; + debug!("--- {stage} ---"); + debug!("Counter {counter}: {counter_balance} SOL"); + debug!("Payer {payer}: {payer_balance} SOL"); + debug!("Escrow {escrow} {escrow_balance} SOL"); + + ( + counter_acc.as_ref().unwrap().lamports, + payer_acc.as_ref().unwrap().lamports, + escrow_acc.as_ref().unwrap().lamports, + ) +} + +#[ignore = "We are still evaluating escrow functionality that allows anything except just paying fees"] #[test] fn test_transfer_from_escrow_to_delegated_account() { init_logger!(); @@ -29,14 +62,14 @@ fn test_transfer_from_escrow_to_delegated_account() { .airdrop_chain_escrowed(&kp_escrowed, 2 * LAMPORTS_PER_SOL) .unwrap(); - assert_eq!( - ctx.fetch_ephem_account(ephemeral_balance_pda) - .unwrap() - .lamports, - escrow_lamports + let (_, _, ephem_escrow_lamports) = log_accounts_balances( + &ctx, + "After delegation and escrowed airdrop", + &counter_pda, + &kp_escrowed.pubkey(), + &ephemeral_balance_pda, ); - - debug!("{:#?}", ctx.fetch_ephem_account(counter_pda).unwrap()); + assert_eq!(ephem_escrow_lamports, escrow_lamports); // 2. Transfer 0.5 SOL from kp1 to counter pda let transfer_amount = LAMPORTS_PER_SOL / 2; @@ -52,36 +85,21 @@ fn test_transfer_from_escrow_to_delegated_account() { ) .unwrap(); - debug!("Transfer tx: {sig} {confirmed}"); + debug!("Transfer tx sig: {sig} ({confirmed}) "); // 3. 
Check balances - let accs = ctx - .fetch_ephem_multiple_accounts(&[ - kp_escrowed.pubkey(), - ephemeral_balance_pda, - counter_pda, - ]) - .unwrap(); - let [escrowed, escrow, counter] = accs.as_slice() else { - panic!("Expected 3 accounts, got {:#?}", accs); - }; - - debug!("Escrowed : '{}': {escrowed:#?}", kp_escrowed.pubkey()); - debug!("Escrow : '{ephemeral_balance_pda}': {escrow:#?}"); - debug!("Counter : '{counter_pda}': {counter:#?}"); - - let escrowed_balance = - escrowed.as_ref().unwrap().lamports as f64 / LAMPORTS_PER_SOL as f64; - let escrow_balance = - escrow.as_ref().unwrap().lamports as f64 / LAMPORTS_PER_SOL as f64; - let counter_balance = - counter.as_ref().unwrap().lamports as f64 / LAMPORTS_PER_SOL as f64; - - debug!( - "\nEscrowed balance: {escrowed_balance}\nEscrow balance : {escrow_balance}\nCounter balance : {counter_balance}" + let (counter_balance, _, escrow_balance) = log_accounts_balances( + &ctx, + "After transfer from escrow to counter", + &counter_pda, + &kp_escrowed.pubkey(), + &ephemeral_balance_pda, ); + let escrow_balance = escrow_balance as f64 / LAMPORTS_PER_SOL as f64; + let counter_balance = counter_balance as f64 / LAMPORTS_PER_SOL as f64; + // Received 1 SOL then transferred 0.5 SOL + tx fee - assert!((0.4..=0.5).contains(&escrowed_balance)); + assert!((0.4..=0.5).contains(&escrow_balance)); // Airdropped 2 SOL - escrowed half assert!(escrow_balance >= 1.0); // Received 0.5 SOL diff --git a/test-integration/test-cloning/tests/05_parallel-cloning.rs b/test-integration/test-cloning/tests/05_parallel-cloning.rs index d0560783a..023fe2a24 100644 --- a/test-integration/test-cloning/tests/05_parallel-cloning.rs +++ b/test-integration/test-cloning/tests/05_parallel-cloning.rs @@ -141,6 +141,7 @@ fn spawn_transfer_thread( }) } +#[ignore = "We are still evaluating escrow functionality that allows anything except just paying fees"] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_multiple_transfers_from_multiple_escrows_in_parallel() { init_logger!(); @@ -235,6 +236,7 @@ async fn test_multiple_transfers_from_multiple_escrows_in_parallel() { // that we can run multiple transactions in paralle. 
// We should move this test once we implement the proper parallel transaction // executor +#[ignore = "We are still evaluating escrow functionality that allows anything except just paying fees"] #[test] fn test_multiple_transfers_from_same_escrow_different_amounts_in_parallel() { init_logger!(); diff --git a/test-integration/test-cloning/tests/06_escrows.rs b/test-integration/test-cloning/tests/06_escrows.rs index 1f81a352f..88c8b9059 100644 --- a/test-integration/test-cloning/tests/06_escrows.rs +++ b/test-integration/test-cloning/tests/06_escrows.rs @@ -46,9 +46,23 @@ fn test_cloning_unescrowed_payer_that_is_escrowed_later() { &delegated_kp.pubkey(), LAMPORTS_PER_SOL / 2, ); - let (_sig, _found) = ctx + let (sig, _found) = ctx .send_and_confirm_instructions_with_payer_ephem(&[ix], &non_escrowed_kp) .unwrap(); + let tx = ctx + .get_transaction_ephem(&sig) + .expect("failed to fetch transaction ephem"); + let err = tx.transaction.meta.unwrap().err; + assert!( + err.is_some(), + "should fail since feepayer is not escrowed yet" + ); + debug!("Initial transaction error: {:#?}", err); + assert_eq!( + err.unwrap().to_string(), + "This account may not be used to pay transaction fees", + "unescrowed payer cannot be writable" + ); // When it completes we should see an empty escrow inside the validator let (escrow_pda, acc) = get_escrow_pda_ephem(&ctx, &non_escrowed_kp); diff --git a/test-integration/test-cloning/tests/07_subscription_limits.rs b/test-integration/test-cloning/tests/07_subscription_limits.rs new file mode 100644 index 000000000..67f53541f --- /dev/null +++ b/test-integration/test-cloning/tests/07_subscription_limits.rs @@ -0,0 +1,121 @@ +use std::{sync::Arc, time::Duration}; + +use integration_test_tools::IntegrationTestContext; +use log::*; +use solana_sdk::{ + native_token::LAMPORTS_PER_SOL, rent::Rent, signature::Keypair, + signer::Signer, +}; +use test_kit::init_logger; +use tokio::task::JoinSet; + +const NUM_PUBKEYS: usize = 400; +// Half of the accounts are delegated and aren't watched +const EXTRA_MONITORED_ACCOUNTS: usize = NUM_PUBKEYS / 2; +const AIRDROP_CHUNK_SIZE: usize = 100; +// See metrics config in: configs/cloning-conf.ephem.toml +const PORT: u16 = 9000; + +// This test creates a large number of accounts, airdrops to all of them +// and delegates half. +// It then ensures that the subscription count increased as expected. +// Since it will be affected by other tests that trigger subscriptions, +// we only run it in isolation manually. +#[ignore = "Run manually only"] +#[tokio::test(flavor = "multi_thread")] +async fn test_large_number_of_account_subscriptions() { + init_logger!(); + let ctx = Arc::new(IntegrationTestContext::try_new().unwrap()); + + debug!("Generating {NUM_PUBKEYS} keypairs..."); + let keypairs: Vec = + (0..NUM_PUBKEYS).map(|_| Keypair::new()).collect(); + debug!("✅ Generated {NUM_PUBKEYS} keypairs"); + + let rent_exempt_amount = Rent::default().minimum_balance(0); + debug!( + "Airdropping {rent_exempt_amount} lamports to {NUM_PUBKEYS} accounts in chunks of {AIRDROP_CHUNK_SIZE}..." 
+ ); + + let payer_chain = Keypair::new(); + ctx.airdrop_chain(&payer_chain.pubkey(), LAMPORTS_PER_SOL * 10) + .expect("failed to airdrop to payer_chain"); + + let monitored_accounts_before = + ctx.get_monitored_accounts_count(PORT).unwrap(); + let mut total_processed = 0; + for (chunk_idx, chunk) in keypairs.chunks(AIRDROP_CHUNK_SIZE).enumerate() { + let mut join_set = JoinSet::new(); + for (idx, keypair) in chunk.iter().enumerate() { + let keypair = keypair.insecure_clone(); + let payer_chain = payer_chain.insecure_clone(); + let ctx = ctx.clone(); + join_set.spawn(async move { + if idx % 2 == 0 { + ctx.airdrop_chain_and_delegate( + &payer_chain, + &keypair, + rent_exempt_amount, + ) + .expect( + "failed to airdrop and delegate to on-chain account", + ); + } else { + ctx.airdrop_chain(&keypair.pubkey(), rent_exempt_amount) + .expect("failed to airdrop to on-chain account"); + } + }); + } + for _result in join_set.join_all().await { + // spawned task panicked or was cancelled - handled by join_all + } + total_processed += chunk.len(); + + let pubkeys = chunk.iter().map(|kp| kp.pubkey()).collect::>(); + + trace!( + "Pubkeys in chunk {}: {}", + chunk_idx + 1, + pubkeys + .iter() + .map(|k| k.to_string()) + .collect::>() + .join(", ") + ); + + debug!( + "✅ Airdropped batch {}: {}/{} accounts ({} total)", + chunk_idx + 1, + chunk.len(), + AIRDROP_CHUNK_SIZE, + total_processed + ); + + let _accounts = ctx + .fetch_ephem_multiple_accounts(&pubkeys) + .expect("failed to fetch accounts"); + + debug!( + "✅ Fetched batch {}: {}/{} accounts ({} total)", + chunk_idx + 1, + chunk.len(), + AIRDROP_CHUNK_SIZE, + total_processed + ); + } + + debug!("✅ Airdropped and fetched all {NUM_PUBKEYS} accounts from ephemeral RPC"); + + // Wait for metrics update + tokio::time::sleep(Duration::from_secs(5)).await; + + let monitored_accounts_after = + ctx.get_monitored_accounts_count(PORT).unwrap(); + let diff = monitored_accounts_after - monitored_accounts_before; + debug!("Monitored accounts count total: {monitored_accounts_after}, diff: {diff}"); + + assert_eq!( + diff, EXTRA_MONITORED_ACCOUNTS, + "Expected monitored accounts to increase by {EXTRA_MONITORED_ACCOUNTS}" + ); +} diff --git a/test-integration/test-config/tests/auto_airdrop_feepayer.rs b/test-integration/test-config/tests/auto_airdrop_feepayer.rs index 9bf018840..1bed43950 100644 --- a/test-integration/test-config/tests/auto_airdrop_feepayer.rs +++ b/test-integration/test-config/tests/auto_airdrop_feepayer.rs @@ -11,7 +11,6 @@ use magicblock_config::{ use solana_sdk::{signature::Keypair, signer::Signer, system_instruction}; use test_kit::init_logger; -#[ignore = "Auto airdrop is not generally supported at this point, we will add this back as needed"] #[test] fn test_auto_airdrop_feepayer_balance_after_tx() { init_logger!(); diff --git a/test-integration/test-schedule-intent/tests/test_schedule_intents.rs b/test-integration/test-schedule-intent/tests/test_schedule_intents.rs index d100658c4..801a69bcf 100644 --- a/test-integration/test-schedule-intent/tests/test_schedule_intents.rs +++ b/test-integration/test-schedule-intent/tests/test_schedule_intents.rs @@ -63,14 +63,10 @@ fn test_schedule_intent_and_undelegate() { schedule_intent(&ctx, &[&payer], Some(vec![-100])); // Assert that action after undelegate subtracted 100 from 101 - assert_counters( - &ctx, - &[ExpectedCounter { - pda: FlexiCounter::pda(&payer.pubkey()).0, - expected: 1, - }], - true, - ); + let pda = FlexiCounter::pda(&payer.pubkey()).0; + assert_counters(&ctx, &[ExpectedCounter { pda, 
expected: 1 }], true); + + verify_undelegation_in_ephem_via_owner(&[payer.pubkey()], &ctx); } #[test] @@ -129,6 +125,8 @@ fn test_schedule_intent_undelegate_delegate_back_undelegate_again() { true, ); + verify_undelegation_in_ephem_via_owner(&[payer.pubkey()], &ctx); + // Delegate back delegate_counter(&ctx, &payer); schedule_intent(&ctx, &[&payer], Some(vec![102])); @@ -191,6 +189,11 @@ fn test_2_payers_intent_with_undelegation() { true, ); debug!("✅ Verified counters on base layer"); + + verify_undelegation_in_ephem_via_owner( + &payers.iter().map(|p| p.pubkey()).collect::>(), + &ctx, + ); } #[test] @@ -236,6 +239,12 @@ fn test_1_payers_intent_with_undelegation() { true, ); debug!("✅ Verified counters on base layer"); + + verify_undelegation_in_ephem_via_owner( + &payers.iter().map(|p| p.pubkey()).collect::>(), + &ctx, + ); + debug!("✅ Verified undelegation via account owner"); } #[ignore = "With sdk having ShortAccountMetas instead of u8s we hit limited_deserialize here as instruction exceeds 1232 bytes"] @@ -447,3 +456,37 @@ fn schedule_intent( mutiplier * payers.len() as u64 * 1_000_000 ); } + +fn verify_undelegation_in_ephem_via_owner( + pubkeys: &[Pubkey], + ctx: &IntegrationTestContext, +) { + const RETRY_LIMIT: usize = 20; + let mut retries = 0; + + loop { + ctx.wait_for_next_slot_ephem().unwrap(); + let mut not_verified = vec![]; + for pk in pubkeys.iter() { + let counter_pda = FlexiCounter::pda(pk).0; + let owner = ctx.fetch_ephem_account_owner(counter_pda).unwrap(); + if owner == delegation_program_id() { + not_verified.push(*pk); + } + } + if not_verified.is_empty() { + break; + } + retries += 1; + if retries >= RETRY_LIMIT { + panic!( + "Failed to verify undelegation for pubkeys: {}", + not_verified + .iter() + .map(|k| k.to_string()) + .collect::>() + .join(", ") + ); + } + } +} diff --git a/test-integration/test-tools/Cargo.toml b/test-integration/test-tools/Cargo.toml index 0f9d4524c..75ea22b36 100644 --- a/test-integration/test-tools/Cargo.toml +++ b/test-integration/test-tools/Cargo.toml @@ -11,6 +11,8 @@ log = { workspace = true } random-port = { workspace = true } rayon = { workspace = true } serde = { workspace = true } +ureq = { workspace = true } +url = { workspace = true } magicblock-core = { workspace = true } magicblock-config = { workspace = true } magicblock-delegation-program = { workspace = true, features = [ diff --git a/test-integration/test-tools/src/integration_test_context.rs b/test-integration/test-tools/src/integration_test_context.rs index f31287102..48ae3a911 100644 --- a/test-integration/test-tools/src/integration_test_context.rs +++ b/test-integration/test-tools/src/integration_test_context.rs @@ -29,6 +29,7 @@ use solana_transaction_status::{ EncodedConfirmedBlock, EncodedConfirmedTransactionWithStatusMeta, UiTransactionEncoding, }; +use url::Url; use crate::{ dlp_interface, @@ -1148,4 +1149,57 @@ impl IntegrationTestContext { pub fn ws_url_chain() -> &'static str { WS_URL_CHAIN } + + // ----------------- + // Prometheus Metrics + // ----------------- + pub fn get_monitored_accounts_count(&self, port: u16) -> Result { + let ephem_url = self.try_ephem_client()?.url(); + let parsed_url = Url::parse(&ephem_url).map_err(|e| { + anyhow::anyhow!( + "Failed to parse ephemeral URL '{}': {}", + ephem_url, + e + ) + })?; + let host = parsed_url.host_str().ok_or_else(|| { + anyhow::anyhow!("No host found in ephemeral URL: {}", ephem_url) + })?; + let metrics_url = format!("http://{host}:{port}/metrics"); + let response = ureq::get(&metrics_url) + .call() + 
.map_err(|e| { + anyhow::anyhow!( + "Failed to fetch metrics from {}: {}", + metrics_url, + e + ) + })? + .into_string() + .map_err(|e| { + anyhow::anyhow!("Failed to read metrics response: {}", e) + })?; + + for line in response.lines() { + if line.starts_with("mbv_monitored_accounts ") { + let value_str = + line.split_whitespace().nth(1).ok_or_else(|| { + anyhow::anyhow!( + "Failed to parse monitored_accounts metric" + ) + })?; + return value_str.parse::().map_err(|e| { + anyhow::anyhow!( + "Failed to parse monitored_accounts value '{}': {}", + value_str, + e + ) + }); + } + } + + Err(anyhow::anyhow!( + "monitored_accounts metric not found in Prometheus response" + )) + } } diff --git a/test-kit/src/lib.rs b/test-kit/src/lib.rs index 7290f82f3..d9dbd1d81 100644 --- a/test-kit/src/lib.rs +++ b/test-kit/src/lib.rs @@ -84,6 +84,12 @@ impl ExecutionTestEnv { Self::new_with_fee(Self::BASE_FEE) } + pub fn new_with_payer_and_fees(payer: &Keypair, fee: u64) -> Self { + let mut ctx = Self::new_with_fee(fee); + ctx.payer = payer.insecure_clone(); + ctx + } + /// Creates a new, fully initialized validator test environment with given base fee /// /// This function sets up a complete validator stack: