From 7bfaff73e3c91c5d59419382eea8aa7b7afc4e55 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 19:55:05 +0200 Subject: [PATCH 01/48] Rename --- ...uance_eligibility_oracle_core.py => service_quality_oracle.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/models/{issuance_eligibility_oracle_core.py => service_quality_oracle.py} (100%) diff --git a/src/models/issuance_eligibility_oracle_core.py b/src/models/service_quality_oracle.py similarity index 100% rename from src/models/issuance_eligibility_oracle_core.py rename to src/models/service_quality_oracle.py From 84b2025f09e768c8e144d5ad5c1ea18f0ec84d0b Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 19:55:23 +0200 Subject: [PATCH 02/48] fix after rename --- pyproject.toml | 2 +- src/models/scheduler.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7ae4b5b..93daa9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" [tool.ruff.lint.per-file-ignores] # Ignore E402 (import not at top) in scripts and specific modules "scripts/test_*.py" = ["E402"] -"src/models/issuance_eligibility_oracle_core.py" = ["E402"] +"src/models/service_quality_oracle.py" = ["E402"] # Use unsafe fixes to address typing and other modernization issues [tool.ruff.lint.isort] diff --git a/src/models/scheduler.py b/src/models/scheduler.py index e0d5adc..ac29d48 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -8,6 +8,7 @@ import schedule from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential +import src.models.service_quality_oracle as oracle from src.models.issuance_data_access_helper import ( _setup_google_credentials_in_memory_from_env_var, ) @@ -91,8 +92,6 @@ def run_oracle(force_date=None): load_config() # Run the oracle - import src.models.issuance_eligibility_oracle_core as oracle - oracle.main() # Record successful run and overwrite the last run date From 9a3b6277fa02107f5d7e777b3159292a8511ba3a Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 19:55:35 +0200 Subject: [PATCH 03/48] update --- src/utils/slack_notifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/slack_notifier.py b/src/utils/slack_notifier.py index 8839255..9487407 100644 --- a/src/utils/slack_notifier.py +++ b/src/utils/slack_notifier.py @@ -45,7 +45,7 @@ def _send_message(self, payload: Dict) -> bool: # Log the message type logger.info(f"Sending Slack notification: {message_type}") - # Attempt to send the message 3 times with exponential backoff + # Attempt to send the message self.max_attempts times with exponential backoff for attempt in range(self.max_attempts): try: response = requests.post( From 448e4d3cf2cd9a1b6d96325d47d069bb7976f1c4 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 22:27:26 +0200 Subject: [PATCH 04/48] better spacing between functions --- src/utils/config_loader.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index d8920f0..3a5b88e 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -41,6 +41,7 @@ def __init__(self, config_path: Optional[str] = None): self.config_path = config_path or self._get_default_config_path() self._env_var_pattern = re.compile(r"\$([A-Z_][A-Z0-9_]*)") + def _get_default_config_path(self) -> str: """Get the default configuration template path.""" # Check if 
we're in a Docker container @@ -58,6 +59,7 @@ def _get_default_config_path(self) -> str: raise ConfigurationError("Could not find config.toml in project root or Docker container") + # TODO: check this... def _substitute_env_vars(self, config_toml: Any) -> Any: """ @@ -97,6 +99,7 @@ def _substitute_env_vars(self, config_toml: Any) -> Any: else: return config_toml + def load_config(self) -> dict[str, Any]: """ Load configuration from config.toml and substitute environment variables. @@ -132,6 +135,7 @@ def load_config(self) -> dict[str, Any]: except Exception as e: raise ConfigurationError(f"Failed to substitute environment variables: {e}") from e + def validate_required_env_vars(self) -> None: """ Validate that all required environment variables are set without loading full config. @@ -159,6 +163,7 @@ def validate_required_env_vars(self) -> None: f"Missing required environment variables: {', '.join(sorted(set(missing_vars)))}" ) + def _collect_missing_env_vars(self, obj: Any) -> list[str]: """ Collect all missing environment variables from config object. @@ -186,6 +191,7 @@ def _collect_missing_env_vars(self, obj: Any) -> list[str]: # After all the missing variables have been collected, return the list return missing_vars + def get_flat_config(self) -> dict[str, Any]: """ Get configuration in flat format. @@ -222,6 +228,7 @@ def get_flat_config(self) -> dict[str, Any]: return flat_config + def _parse_rpc_urls(self, rpc_urls: list) -> list[str]: """Parse RPC URLs from list format.""" if not rpc_urls: From c54f7ac402e48636428f80e59f922328cb82b4fe Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 22:34:34 +0200 Subject: [PATCH 05/48] refactor config_loader --- src/utils/config_loader.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index 3a5b88e..c490091 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -115,25 +115,22 @@ def load_config(self) -> dict[str, Any]: # Load the TOML configuration with open(self.config_path, "rb") as f: config = tomllib.load(f) - + logger.info(f"Loaded configuration from: {self.config_path}") - - except FileNotFoundError: - raise ConfigurationError(f"Configuration not found: {self.config_path}") from None - except Exception as e: - raise ConfigurationError(f"Failed to parse configuration: {e}") from e - - try: + # Substitute environment variables throughout the configuration config = self._substitute_env_vars(config) - + logger.info("Successfully loaded configuration with environment variables") return config - + + except FileNotFoundError as e: + raise ConfigurationError(f"Configuration not found: {self.config_path}") from e except ConfigurationError: raise except Exception as e: - raise ConfigurationError(f"Failed to substitute environment variables: {e}") from e + error_context = "parse configuration" if "tomllib" in str(e) else "substitute environment variables" + raise ConfigurationError(f"Failed to {error_context}: {e}") from e def validate_required_env_vars(self) -> None: From ba08fc1efcd3698c62ebe0d3b7252a2c32779e7b Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 22:45:48 +0200 Subject: [PATCH 06/48] Create retry_decorator.py --- src/utils/retry_decorator.py | 57 ++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 src/utils/retry_decorator.py diff --git a/src/utils/retry_decorator.py b/src/utils/retry_decorator.py new file mode 100644 index 0000000..b5a4887 --- 
/dev/null +++ b/src/utils/retry_decorator.py @@ -0,0 +1,57 @@ +""" +Standardized retry decorator with consistent backoff strategy across the application. +""" + +import logging +from functools import wraps +from typing import Any, Callable, Type, Union + +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, + before_sleep_log, +) + +logger = logging.getLogger(__name__) + + +def retry_with_backoff( + max_attempts: int = 5, + min_wait: int = 1, + max_wait: int = 120, + multiplier: int = 2, + exceptions: Union[Type[Exception], tuple[Type[Exception], ...]] = Exception, + reraise: bool = True, +) -> Callable: + """ + Retry decorator with exponential backoff. + + Args: + max_attempts: Maximum number of retry attempts (default: 5) + min_wait: Minimum wait time between retries in seconds (default: 1) + max_wait: Maximum wait time between retries in seconds (default: 120) + multiplier: Exponential backoff multiplier (default: 2) + exceptions: Exception types to retry on (default: Exception) + reraise: Whether to reraise the exception after all attempts fail (default: True) + + Returns: + Decorated function with retry logic + """ + def decorator(func: Callable) -> Callable: + """Retry decorator with exponential backoff.""" + @retry( + retry=retry_if_exception_type(exceptions), + stop=stop_after_attempt(max_attempts), + wait=wait_exponential(multiplier=multiplier, min=min_wait, max=max_wait), + before_sleep=before_sleep_log(logger, logging.WARNING), + reraise=reraise, + ) + @wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + return func(*args, **kwargs) + + return wrapper + + return decorator From dd680754c17ea4174ff86f6a0aa441733f169774 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 22:54:22 +0200 Subject: [PATCH 07/48] Update slack notifier to use retry decorator --- src/utils/slack_notifier.py | 86 +++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/src/utils/slack_notifier.py b/src/utils/slack_notifier.py index 9487407..a4df270 100644 --- a/src/utils/slack_notifier.py +++ b/src/utils/slack_notifier.py @@ -9,6 +9,8 @@ import requests +from src.utils.retry_decorator import retry_with_backoff + # Module-level logger logger = logging.getLogger(__name__) @@ -25,8 +27,14 @@ def __init__(self, webhook_url: str) -> None: """ self.webhook_url = webhook_url self.timeout = 10 # seconds - self.max_attempts = 8 + + @retry_with_backoff( + max_attempts=8, + min_wait=1, + max_wait=128, + exceptions=(requests.exceptions.RequestException,) + ) def _send_message(self, payload: Dict) -> bool: """ Send a message to Slack via webhook with exponential backoff retry. 
@@ -37,50 +45,37 @@ def _send_message(self, payload: Dict) -> bool: Returns: bool: True if message was sent successfully, False otherwise """ - import time - # Get the message type from the payload message_type = payload.get("text", "Unknown") # Log the message type logger.info(f"Sending Slack notification: {message_type}") - # Attempt to send the message self.max_attempts times with exponential backoff - for attempt in range(self.max_attempts): - try: - response = requests.post( - self.webhook_url, - json=payload, - timeout=self.timeout, - headers={"Content-Type": "application/json"}, - ) - - # If the message is sent successfully, return True - if response.status_code == 200: - logger.info("Slack notification sent successfully") - return True - + try: + response = requests.post( + self.webhook_url, + json=payload, + timeout=self.timeout, + headers={"Content-Type": "application/json"}, + ) + + # If the message is sent successfully, return True + if response.status_code == 200: + logger.info("Slack notification sent successfully") + return True + else: # log message failure - else: - logger.warning(f"Slack notification failed: {response.status_code}") - - # If there is an error when trying to send the message, log the error - except requests.exceptions.RequestException as e: - logger.warning(f"Slack notification attempt {attempt + 1} failed: {str(e)}") + logger.warning(f"Slack notification failed: {response.status_code}") + # Raise an exception to trigger retry + response.raise_for_status() + return False - # If the last attempt fails, log an error - if attempt == self.max_attempts - 1: - logger.error("All Slack notification attempts failed") + # If there is an error when trying to send the message, log the error and re-raise + except requests.exceptions.RequestException as e: + logger.warning(f"Slack notification failed: {str(e)}") + raise - # If the attempt is not the last, wait for the exponential backoff and retry - else: - # Exponential backoff: 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s - wait_time = 2**attempt - logger.info(f"Retrying in {wait_time} seconds...") - time.sleep(wait_time) - # If the message is not sent successfully, return False - return False def _create_payload(self, text: str, fields: List[Dict], color: str = "good") -> Dict: """Create a Slack message payload.""" @@ -96,6 +91,7 @@ def _create_payload(self, text: str, fields: List[Dict], color: str = "good") -> ], } + def send_success_notification( self, eligible_indexers: List[str], @@ -144,7 +140,12 @@ def send_success_notification( payload = self._create_payload("Service Quality Oracle - Success", fields, "good") # Send message payload to Slack - return self._send_message(payload) + try: + return self._send_message(payload) + except Exception as e: + logger.error(f"Failed to send success notification: {e}") + return False + def send_failure_notification( self, @@ -197,7 +198,12 @@ def send_failure_notification( payload = self._create_payload("Service Quality Oracle - FAILURE", fields, "danger") # Send message payload to Slack - return self._send_message(payload) + try: + return self._send_message(payload) + except Exception as e: + logger.error(f"Failed to send failure notification: {e}") + return False + def send_info_notification(self, message: str, title: str = "Info") -> bool: """ @@ -222,7 +228,11 @@ def send_info_notification(self, message: str, title: str = "Info") -> bool: payload = self._create_payload(f"Service Quality Oracle - {title}", fields) # Send message payload to Slack - return 
self._send_message(payload) + try: + return self._send_message(payload) + except Exception as e: + logger.error(f"Failed to send info notification: {e}") + return False def create_slack_notifier(webhook_url: Optional[str]) -> Optional[SlackNotifier]: From 91e2add9a995737f01ca5b3af533d736dbdfbcc1 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 22:55:33 +0200 Subject: [PATCH 08/48] better spacing --- src/utils/config_loader.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index c490091..893bc37 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -204,18 +204,23 @@ def get_flat_config(self) -> dict[str, Any]: "bigquery_location": config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), "bigquery_project_id": config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), "bigquery_dataset_id": config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), + # Blockchain settings "contract_address": config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), "contract_function": config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), "chain_id": config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID"), "rpc_providers": self._parse_rpc_urls(config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), + # Scheduling "scheduled_run_time": config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), + # Subgraph URLs "subgraph_url": config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), + # Processing settings "batch_size": config.get("processing", {}).get("BATCH_SIZE", 125), "max_age_before_deletion": config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120), + # Secrets "google_application_credentials": config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), "private_key": config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), From 174b8d2746e4c5dd6ce6b40b17f32151c623659c Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 23:22:52 +0200 Subject: [PATCH 09/48] Move _validate_required_fields to config_manager --- src/utils/config_manager.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/utils/config_manager.py diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py new file mode 100644 index 0000000..38175c1 --- /dev/null +++ b/src/utils/config_manager.py @@ -0,0 +1,42 @@ +""" +Centralized configuration manager with validation and credential handling. +""" + +import json +import logging +import os +from datetime import datetime +from pathlib import Path +from typing import Any + +from src.utils.config_loader import ConfigLoader, ConfigurationError +from src.utils.retry_decorator import retry_with_backoff + +logger = logging.getLogger(__name__) + + +class ConfigManager: + """Centralized configuration manager with validation and credential handling.""" + + def __init__(self): + self._config = None + + + def _validate_required_fields(self, data: dict, required_fields: list[str], context: str) -> None: + """ + Helper function to validate required fields are present in a dictionary. 
+ + Args: + data: Dictionary to validate + required_fields: List of required fields + context: Context for error message + + Raises: + ValueError: If required fields are missing + """ + # Validate that all required fields are present in the data + missing_fields = [field for field in required_fields if field not in data] + + # If any required fields are missing, raise an error + if missing_fields: + raise ValueError(f"{context}: missing {missing_fields}") From 0f276cecafc9fde8f2c7363ee56e78460609991a Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 23:24:33 +0200 Subject: [PATCH 10/48] move _load_config_and_return_validated to config manager module as load_and_validate_config --- src/utils/config_manager.py | 75 +++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py index 38175c1..deab4f3 100644 --- a/src/utils/config_manager.py +++ b/src/utils/config_manager.py @@ -40,3 +40,78 @@ def _validate_required_fields(self, data: dict, required_fields: list[str], cont # If any required fields are missing, raise an error if missing_fields: raise ValueError(f"{context}: missing {missing_fields}") + + + def load_and_validate_config(self) -> dict[str, Any]: + """ + Load all necessary configurations using config loader, validate, and return them. + This function is called once at startup to load the configuration. + + Returns: + Dict[str, Any]: Config dictionary with validated and converted values. + { + "bigquery_project_id": str, + "bigquery_location": str, + "rpc_providers": list[str], + "contract_address": str, + "contract_function": str, + "chain_id": int, + "scheduled_run_time": str, + "batch_size": int, + "max_age_before_deletion": int, + } + Raises: + ConfigurationError: If configuration loading fails + ValueError: If configuration validation fails + """ + # If the configuration has already been loaded, return it + if self._config is not None: + return self._config + + try: + # Load configuration using config loader + loader = ConfigLoader() + config = loader.get_flat_config() + logger.info("Successfully loaded configuration") + + # Validate and convert chain_id to integer + if config.get("chain_id"): + try: + config["chain_id"] = int(config["chain_id"]) + except ValueError as e: + raise ValueError(f"Invalid BLOCKCHAIN_CHAIN_ID: {config['chain_id']} - must be an integer.") from e + + # Validate scheduled run time format (HH:MM) + if config.get("scheduled_run_time"): + try: + datetime.strptime(config["scheduled_run_time"], "%H:%M") + except ValueError as e: + raise ValueError( + f"Invalid SCHEDULED_RUN_TIME format: {config['scheduled_run_time']} - " + "must be in HH:MM format" + ) from e + + # Validate blockchain configuration contains all required fields + required_fields = [ + "private_key", + "contract_address", + "contract_function", + "chain_id", + "scheduled_run_time", + ] + self._validate_required_fields(config, required_fields, "Missing required blockchain configuration") + + # Validate RPC providers + if not config.get("rpc_providers") or not isinstance(config["rpc_providers"], list): + raise ValueError("BLOCKCHAIN_RPC_URLS must be a list of valid RPC URLs") + + # Set the configuration in the class & return it + self._config = config + return config + + except ConfigurationError: + raise + except Exception as e: + raise ConfigurationError(f"Configuration validation failed: {e}") from e + + From b89b27182aee471c276a9a578273a280443a2924 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: 
Mon, 9 Jun 2025 23:25:44 +0200 Subject: [PATCH 11/48] move _get_path_to_project_root to config_manager as get_project_root --- src/utils/config_manager.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py index deab4f3..cd442d8 100644 --- a/src/utils/config_manager.py +++ b/src/utils/config_manager.py @@ -115,3 +115,25 @@ def load_and_validate_config(self) -> dict[str, Any]: raise ConfigurationError(f"Configuration validation failed: {e}") from e + @staticmethod + def get_project_root() -> Path: + """ + Get the path to the project root directory. + In Docker environments, use /app. Otherwise, find by marker files. + """ + # Use the /app directory as the project root if it exists + docker_path = Path("/app") + if docker_path.exists(): + return docker_path + + # If the /app directory doesn't exist fall back to marker files + current_path = Path(__file__).parent + while current_path != current_path.parent: + if (current_path / ".gitignore").exists() or (current_path / "pyproject.toml").exists(): + logger.info(f"Found project root at: {current_path}") + return current_path + # Attempt to traverse upwards (will not work if the directory has no parent) + current_path = current_path.parent + + # If we got here, something is wrong + raise FileNotFoundError("Could not find project root directory. Investigate.") From 86306f72dde937d13f3293b953ccc83cd7c68550 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 23:30:00 +0200 Subject: [PATCH 12/48] move remaining configuration and setup functions to config_manager --- src/utils/config_manager.py | 207 ++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py index cd442d8..21f29d5 100644 --- a/src/utils/config_manager.py +++ b/src/utils/config_manager.py @@ -137,3 +137,210 @@ def get_project_root() -> Path: # If we got here, something is wrong raise FileNotFoundError("Could not find project root directory. Investigate.") + + +class CredentialManager: + """Handles credential management for Google Cloud services.""" + + def __init__(self): + pass + + + def _validate_required_fields(self, data: dict, required_fields: list[str], context: str) -> None: + """ + Helper function to validate required fields are present in a dictionary. + + Args: + data: Dictionary to validate + required_fields: List of required fields + context: Context for error message + + Raises: + ValueError: If required fields are missing + """ + # Validate that all required fields are present in the data + missing_fields = [field for field in required_fields if field not in data] + + # If any required fields are missing, raise an error + if missing_fields: + raise ValueError(f"{context}: missing {missing_fields}") + + + def _parse_and_validate_credentials_json(self, creds_env: str) -> dict: + """ + Parse and validate Google credentials JSON from environment variable. 
+ + Args: + creds_env: JSON string containing credentials + + Returns: + dict: Parsed and validated credentials data + + Raises: + ValueError: If JSON is invalid or credentials are incomplete + """ + # Try to parse the credentials + try: + # Parse the credentials + creds_data = json.loads(creds_env) + cred_type = creds_data.get("type", "") + + # Validate the credentials data based on the type + if cred_type == "authorized_user": + required_fields = ["client_id", "client_secret", "refresh_token"] + self._validate_required_fields( + creds_data, required_fields, "Incomplete authorized_user credentials" + ) + + elif cred_type == "service_account": + required_fields = ["private_key", "client_email", "project_id"] + self._validate_required_fields( + creds_data, required_fields, "Incomplete service_account credentials" + ) + + else: + raise ValueError( + f"Unsupported credential type: '{cred_type}'. Expected 'authorized_user' or 'service_account'" + ) + + # If the credentials parsing fails, raise an error + except Exception as e: + logger.error(f"Failed to parse and validate credentials JSON: {e}") + raise ValueError(f"Invalid credentials JSON: {e}") from e + + # Return the parsed credentials + return creds_data + + + def _setup_user_credentials_in_memory(self, creds_data: dict) -> None: + """Set up user account credentials directly in memory.""" + import google.auth + from google.oauth2.credentials import Credentials + + # Try to set up the credentials + try: + credentials = Credentials( + token=None, + refresh_token=creds_data.get("refresh_token"), + client_id=creds_data.get("client_id"), + client_secret=creds_data.get("client_secret"), + token_uri="https://oauth2.googleapis.com/token", + ) + + # Set credentials globally for GCP libraries + google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] + logger.info("Successfully loaded user account credentials from environment variable") + + # Clear credentials from memory + finally: + if "creds_data" in locals(): + creds_data.clear() + + + def _setup_service_account_credentials_in_memory(self, creds_data: dict) -> None: + """Set up service account credentials directly in memory.""" + import google.auth + from google.oauth2 import service_account + + # Try to set up the credentials + try: + # Create credentials object directly from dict + credentials = service_account.Credentials.from_service_account_info( + creds_data + ) + + # Set credentials globally for GCP libraries + google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] + logger.info("Successfully loaded service account credentials from environment variable") + + # If the credentials creation fails, raise an error + except Exception as e: + logger.error(f"Failed to create service account credentials: {e}") + raise ValueError(f"Invalid service account credentials: {e}") from e + + # Clear the original credentials dict from memory if it exists + finally: + if "creds_data" in locals(): + creds_data.clear() + + + @retry_with_backoff(max_attempts=3, exceptions=(ValueError,)) + def setup_google_credentials(self) -> None: + """ + Set up Google credentials directly in memory from environment variable. + This function handles multiple credential formats securely: + 1. JSON string in GOOGLE_APPLICATION_CREDENTIALS (inline credentials) + 2. File path in GOOGLE_APPLICATION_CREDENTIALS + 3. 
Automatic fallback to gcloud CLI authentication + """ + # Get the account credentials from the environment variable + creds_env = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") + + # If the credentials are not set, log a warning and return + if not creds_env: + logger.warning( + "GOOGLE_APPLICATION_CREDENTIALS not set. Falling back to gcloud CLI user credentials if available" + ) + return + + # Case 1: JSON credentials provided inline + if creds_env.strip().startswith("{"): + creds_data = None + try: + # Parse and validate credentials + creds_data = self._parse_and_validate_credentials_json(creds_env) + cred_type = creds_data.get("type") + + # Set up credentials based on type + if cred_type == "authorized_user": + self._setup_user_credentials_in_memory(creds_data.copy()) + elif cred_type == "service_account": + self._setup_service_account_credentials_in_memory(creds_data.copy()) + + except Exception as e: + logger.error("Failed to set up credentials from environment variable") + raise ValueError(f"Error processing inline credentials: {e}") from e + finally: + if creds_data is not None: + creds_data.clear() + + # Case 2: File path provided + elif os.path.exists(creds_env): + logger.info(f"Using credentials file: {creds_env}") + + # Case 3: Invalid format + else: + logger.warning( + f"GOOGLE_APPLICATION_CREDENTIALS appears to be neither valid JSON nor existing file path: {creds_env[:50]}..." + ) + logger.warning("Falling back to gcloud CLI authentication if available") + + + def validate_google_credentials(self) -> bool: + """ + Validate that Google credentials are properly configured and working. + + Returns: + bool: True if credentials are valid and working + """ + try: + import google.auth + + # Try to get default credentials + credentials, project = google.auth.default() + + if credentials: + logger.info(f"Google credentials validated successfully for project: {project}") + return True + else: + logger.error("No valid Google credentials found") + return False + + except Exception as e: + logger.error(f"Google credentials validation failed: {e}") + return False + + +# Global instances for easy access +config_manager = ConfigManager() +credential_manager = CredentialManager() \ No newline at end of file From 6deab53777dd3b1a07201cf8c82d17086709162d Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 23:43:51 +0200 Subject: [PATCH 13/48] Move function to data processor module Move _export_bigquery_data_as_csvs_and_return_lists_of_ineligible_and_eligible_indexers to data processor module as export_bigquery_data_as_csvs_and_return_indexer_lists --- src/models/data_processor.py | 73 ++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 src/models/data_processor.py diff --git a/src/models/data_processor.py b/src/models/data_processor.py new file mode 100644 index 0000000..0c7a2bf --- /dev/null +++ b/src/models/data_processor.py @@ -0,0 +1,73 @@ +""" +Data processing utility module for Service Quality Oracle. 
+ +This module handles data processing operations including: +- CSV export and file management +- Data cleaning and directory maintenance +- Indexer data filtering and organization +""" + +import logging +import shutil +from datetime import date, datetime +from pathlib import Path +from typing import List, Tuple + +import pandas as pd + +logger = logging.getLogger(__name__) + + +class DataProcessor: + """Handles data processing and file management operations.""" + + def __init__(self, project_root: Path): + """ + Initialize the data processor. + + Args: + project_root: Path to project root directory + """ + # Set the project root and output directory + self.project_root = project_root + self.output_dir = project_root / "data" / "output" + + + def export_bigquery_data_as_csvs_and_return_indexer_lists( + self, input_data_from_bigquery: pd.DataFrame, output_date_dir: Path + ) -> Tuple[List[str], List[str]]: + """ + Export BigQuery data as CSVs and return lists of eligible/ineligible indexers. + + Args: + input_data_from_bigquery: Indexer data returned from BigQuery + output_date_dir: Path to date directory for output files + + Returns: + Tuple[List[str], List[str]]: Two lists of indexer addresses, eligible and ineligible + """ + # Ensure the output directory exists, creating parent directories if necessary + output_date_dir.mkdir(exist_ok=True, parents=True) + + # Save raw data for internal use + raw_data_path = output_date_dir / "indexer_issuance_eligibility_data.csv" + input_data_from_bigquery.to_csv(raw_data_path, index=False) + logger.info(f"Saved raw bigquery results df to: {raw_data_path}") + + # Filter eligible and ineligible indexers + eligible_df = input_data_from_bigquery[input_data_from_bigquery["eligible_for_indexing_rewards"] == 1] + ineligible_df = input_data_from_bigquery[input_data_from_bigquery["eligible_for_indexing_rewards"] == 0] + + # Save filtered data + eligible_path = output_date_dir / "eligible_indexers.csv" + ineligible_path = output_date_dir / "ineligible_indexers.csv" + + eligible_df[["indexer"]].to_csv(eligible_path, index=False) + ineligible_df[["indexer"]].to_csv(ineligible_path, index=False) + + logger.info(f"Saved {len(eligible_df)} eligible indexers to: {eligible_path}") + logger.info(f"Saved {len(ineligible_df)} ineligible indexers to: {ineligible_path}") + + # Return lists of eligible and ineligible indexers + return eligible_df["indexer"].tolist(), ineligible_df["indexer"].tolist() + From 61459d36e11c7dec991a61e807dd235e2f09ca5e Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 23:45:20 +0200 Subject: [PATCH 14/48] Move another function to data processor module Move _clean_old_date_directories to data processor module as clean_old_date_directories --- src/models/data_processor.py | 62 ++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/src/models/data_processor.py b/src/models/data_processor.py index 0c7a2bf..126ad30 100644 --- a/src/models/data_processor.py +++ b/src/models/data_processor.py @@ -71,3 +71,65 @@ def export_bigquery_data_as_csvs_and_return_indexer_lists( # Return lists of eligible and ineligible indexers return eligible_df["indexer"].tolist(), ineligible_df["indexer"].tolist() + + def clean_old_date_directories(self, max_age_before_deletion: int) -> None: + """ + Remove old date directories to prevent unlimited growth. 
+ + Args: + max_age_before_deletion: Maximum age in days before deleting data output + """ + today = date.today() + + # Check if the output directory exists + if not self.output_dir.exists(): + logger.warning(f"Output directory does not exist: {self.output_dir}") + return + + directories_removed = 0 + + # Only process directories with date format YYYY-MM-DD + for item in self.output_dir.iterdir(): + if not item.is_dir(): + continue + + try: + # Try to parse the directory name as a date + dir_date = datetime.strptime(item.name, "%Y-%m-%d").date() + age_days = (today - dir_date).days + + # Remove if older than max_age_before_deletion + if age_days > max_age_before_deletion: + logger.info(f"Removing old data directory: {item} ({age_days} days old)") + shutil.rmtree(item) + directories_removed += 1 + + except ValueError: + # Skip directories that don't match date format + logger.debug(f"Skipping non-date directory: {item.name}") + continue + + if directories_removed > 0: + logger.info(f"Removed {directories_removed} old data directories") + else: + logger.info("No old data directories found to remove") + + + def get_date_output_directory(self, current_date: date) -> Path: + """ + Get the output directory path for a specific date. + + Args: + current_date: Date for which to get the output directory + + Returns: + Path: Path to the date-specific output directory + """ + return self.output_dir / current_date.strftime("%Y-%m-%d") + + + def ensure_output_directory_exists(self) -> None: + """Ensure the main output directory exists.""" + # Create the output directory if it doesn't exist + self.output_dir.mkdir(exist_ok=True, parents=True) + logger.debug(f"Ensured output directory exists: {self.output_dir}") From 1f59819dfd4aba46db8b6a02fbb769ae6e3223f6 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Mon, 9 Jun 2025 23:45:33 +0200 Subject: [PATCH 15/48] add functions to data processor module --- src/models/data_processor.py | 59 ++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/models/data_processor.py b/src/models/data_processor.py index 126ad30..c561639 100644 --- a/src/models/data_processor.py +++ b/src/models/data_processor.py @@ -133,3 +133,62 @@ def ensure_output_directory_exists(self) -> None: # Create the output directory if it doesn't exist self.output_dir.mkdir(exist_ok=True, parents=True) logger.debug(f"Ensured output directory exists: {self.output_dir}") + + + def validate_dataframe_structure(self, df: pd.DataFrame, required_columns: List[str]) -> bool: + """ + Validate that a DataFrame has the required columns. + + Args: + df: DataFrame to validate + required_columns: List of required column names + + Returns: + bool: True if all required columns are present + + Raises: + ValueError: If required columns are missing + """ + # Check if any required columns are missing + missing_columns = [col for col in required_columns if col not in df.columns] + + # If any required columns are missing, raise an error + if missing_columns: + raise ValueError(f"DataFrame missing required columns: {missing_columns}") + + # If all required columns are present, return True + return True + + + def get_directory_size_info(self) -> dict: + """ + Get information about the output directory size and file counts. 
+ + Returns: + dict: Information about directory size and contents + """ + # If the directory doesn't exist, return a dictionary with 0 values + if not self.output_dir.exists(): + return {"exists": False, "total_size_bytes": 0, "directory_count": 0, "file_count": 0} + + total_size = 0 + file_count = 0 + directory_count = 0 + + # Get the total size of the directory and the number of files and directories + for item in self.output_dir.rglob("*"): + if item.is_file(): + total_size += item.stat().st_size + file_count += 1 + elif item.is_dir(): + directory_count += 1 + + # Return the information about the directory size and contents + return { + "exists": True, + "total_size_bytes": total_size, + "total_size_mb": round(total_size / (1024 * 1024), 2), + "directory_count": directory_count, + "file_count": file_count, + "path": str(self.output_dir), + } From c16d00a99e988c43e5aa40cd2116fc56fa5385cb Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:16:17 +0200 Subject: [PATCH 16/48] Move blockchain interaction functions to dedicated blockchain client module --- src/models/blockchain_client.py | 650 ++++++++++++++++++++++++++++++++ 1 file changed, 650 insertions(+) create mode 100644 src/models/blockchain_client.py diff --git a/src/models/blockchain_client.py b/src/models/blockchain_client.py new file mode 100644 index 0000000..aceaa50 --- /dev/null +++ b/src/models/blockchain_client.py @@ -0,0 +1,650 @@ +""" +Blockchain client for Service Quality Oracle. + +This module handles all blockchain interactions including: +- Contract ABI loading +- RPC provider connections with failover +- Transaction building, signing, and sending +- Gas estimation and nonce management +""" + +import json +import logging +from pathlib import Path +from typing import Any, Callable, Dict, List, Tuple + +from web3 import Web3 +from web3.contract import Contract + +from src.utils.key_validator import KeyValidationError, validate_and_format_private_key +from src.utils.retry_decorator import retry_with_backoff + +logger = logging.getLogger(__name__) + + +class BlockchainClient: + """Handles all blockchain interactions""" + + def __init__(self, rpc_providers: List[str], contract_address: str, project_root: Path): + """ + Initialize the blockchain client. + + Args: + rpc_providers: List of RPC provider URLs + contract_address: Smart contract address + project_root: Path to project root directory + """ + self.rpc_providers = rpc_providers + self.contract_address = contract_address + self.project_root = project_root + self.contract_abi = self._load_contract_abi() + + + def _load_contract_abi(self) -> List[Dict]: + """Load the contract ABI from the contracts directory.""" + # Try to load the ABI file + try: + abi_path = self.project_root / "contracts" / "contract.abi.json" + with open(abi_path) as f: + return json.load(f) + + # If the ABI file cannot be loaded, raise an error + except Exception as e: + logger.error(f"Failed to load contract ABI: {str(e)}") + raise + + + @retry_with_backoff(max_attempts=3, exceptions=(ConnectionError,)) + def _get_working_web3_connection( + self, rpc_providers: List[str], contract_address: str, contract_abi: List[Dict] + ) -> Tuple[Web3, Contract, str]: + """ + Try connecting to RPC providers until one works. 
+
+        Args:
+            rpc_providers: List of RPC provider URLs to try connecting to
+            contract_address: Contract address for creating contract instance
+            contract_abi: Contract ABI for creating contract instance
+
+        Returns:
+            Tuple[Web3, Contract, str]: Working web3 instance, contract instance, and provider URL
+
+        Raises:
+            ConnectionError: If all RPC providers fail
+        """
+        # Try to connect to each RPC provider in sequence
+        for i, rpc_url in enumerate(rpc_providers):
+            try:
+                provider_type = "primary" if i == 0 else f"backup #{i}"
+                logger.info(f"Attempting to connect to {provider_type} RPC provider: {rpc_url}")
+                w3 = Web3(Web3.HTTPProvider(rpc_url))
+
+                # Test connection
+                if w3.is_connected():
+                    logger.info(f"Successfully connected to {provider_type} RPC provider")
+                    # Create contract instance and return web3 instance, contract instance, and provider URL
+                    contract = w3.eth.contract(
+                        address=Web3.to_checksum_address(contract_address), abi=contract_abi
+                    )
+
+                    return w3, contract, rpc_url
+
+                # If we could not connect, log a warning
+                else:
+                    logger.warning(f"Could not connect to {provider_type} RPC provider: {rpc_url}")
+
+            # If we get an error, log the error
+            except Exception as e:
+                provider_type = "primary" if i == 0 else f"backup #{i}"
+                logger.warning(f"Error connecting to {provider_type} RPC provider {rpc_url}: {str(e)}")
+
+        # If we get here, all providers failed
+        raise ConnectionError(f"Failed to connect to any of {len(rpc_providers)} RPC providers: {rpc_providers}")
+
+
+    def _setup_transaction_account(self, private_key: str, w3: Web3) -> str:
+        """
+        Get the address of the account from the private key.
+
+        Args:
+            private_key: Private key for the account
+            w3: Web3 instance
+
+        Returns:
+            str: Address of the account
+        """
+        try:
+            account = w3.eth.account.from_key(private_key)
+            logger.info(f"Using account: {account.address}")
+            return account.address
+
+        # If the account cannot be retrieved, log the error and raise an exception
+        except Exception as e:
+            logger.error(f"Failed to retrieve account from private key: {str(e)}")
+            raise
+
+
+    def _estimate_transaction_gas(
+        self, w3: Web3, contract_func: Any, indexer_addresses: List[str],
+        data_bytes: bytes, sender_address: str
+    ) -> int:
+        """
+        Estimate gas for the transaction with 25% buffer.
+
+        Args:
+            w3: Web3 instance
+            contract_func: Contract function to call
+            indexer_addresses: List of indexer addresses
+            data_bytes: Data bytes for the transaction
+            sender_address: Transaction sender address
+
+        Returns:
+            int: Estimated gas with 25% buffer
+        """
+        # Try to estimate the gas for the transaction
+        try:
+            estimated_gas = contract_func(indexer_addresses, data_bytes).estimate_gas({"from": sender_address})
+            gas_limit = int(estimated_gas * 1.25)  # 25% buffer
+            logger.info(f"Estimated gas: {estimated_gas}, with buffer: {gas_limit}")
+            return gas_limit
+
+        # If the gas estimation fails, log the error and raise an exception
+        except Exception as e:
+            logger.error(f"Gas estimation failed: {str(e)}")
+            raise
+
+
+    def _determine_transaction_nonce(self, w3: Web3, sender_address: str, replace: bool) -> int:
+        """
+        Determine the appropriate nonce for the transaction.
+ + Args: + w3: Web3 instance + sender_address: Transaction sender address + replace: Whether to replace pending transactions + + Returns: + int: Transaction nonce to use + """ + # If we are not replacing a pending transaction, use the next available nonce + if not replace: + nonce = w3.eth.get_transaction_count(sender_address) + logger.info(f"Using next available nonce: {nonce}") + return nonce + + # If we are replacing a pending transaction, try to find and replace it + logger.info("Attempting to find and replace a pending transaction") + + # Try to find pending transactions + try: + pending_txs = w3.eth.get_block("pending", full_transactions=True) + sender_pending_txs = [ + tx for tx in pending_txs.transactions + if hasattr(tx, "from") and tx["from"] == sender_address + ] + + # If we found pending transactions, use the nonce of the first pending transaction + if sender_pending_txs: + sender_pending_txs.sort(key=lambda x: x["nonce"]) + nonce = sender_pending_txs[0]["nonce"] + logger.info(f"Found pending transaction with nonce {nonce} for replacement") + return nonce + + # If we could not find pending transactions log the issue + except Exception as e: + logger.warning(f"Could not check pending transactions: {str(e)}") + + # Check for nonce gaps + try: + current_nonce = w3.eth.get_transaction_count(sender_address, "pending") + latest_nonce = w3.eth.get_transaction_count(sender_address, "latest") + if current_nonce > latest_nonce: + logger.info(f"Detected nonce gap: latest={latest_nonce}, pending={current_nonce}") + return latest_nonce + + # If we could not check nonce gaps log the issue + except Exception as e: + logger.warning(f"Could not check nonce gap: {str(e)}") + + # Fallback to next available nonce + nonce = w3.eth.get_transaction_count(sender_address) + logger.info(f"Using next available nonce: {nonce}") + return nonce + + + def _get_gas_prices(self, w3: Web3, replace: bool) -> Tuple[int, int]: + """Get base fee and max priority fee for transaction.""" + # Get current gas prices with detailed logging + try: + latest_block = w3.eth.get_block("latest") + base_fee = latest_block["baseFeePerGas"] + logger.info(f"Latest block base fee: {base_fee/1e9:.2f} gwei") + + # If the base fee cannot be retrieved, use a fallback value + except Exception as e: + logger.warning(f"Could not get base fee: {e}") + base_fee = w3.to_wei(10, "gwei") + + # Try to get the max priority fee + try: + max_priority_fee = w3.eth.max_priority_fee + logger.info(f"Max priority fee: {max_priority_fee/1e9:.2f} gwei") + + # If the max priority fee cannot be retrieved, use a fallback value + except Exception as e: + logger.warning(f"Could not get max priority fee: {e}") + max_priority_fee = w3.to_wei(2, "gwei") # fallback + + # Return the base fee and max priority fee + return base_fee, max_priority_fee + + + def _build_transaction_params( + self, + sender_address: str, + nonce: int, + chain_id: int, + gas_limit: int, + base_fee: int, + max_priority_fee: int, + replace: bool, + ) -> Dict: + """Build transaction parameters with appropriate gas prices.""" + tx_params = {"from": sender_address, "nonce": nonce, "chainId": chain_id, "gas": gas_limit} + + # Set gas prices (higher for replacement transactions) + if replace: + max_fee_per_gas = base_fee * 4 + max_priority_fee * 2 + max_priority_fee_per_gas = max_priority_fee * 2 + tx_params["maxFeePerGas"] = max_fee_per_gas + tx_params["maxPriorityFeePerGas"] = max_priority_fee_per_gas + logger.info(f"High gas for replacement: {max_fee_per_gas/1e9:.2f} gwei") + + # If we are 
not replacing a pending transaction, use a lower gas price + else: + max_fee_per_gas = base_fee * 2 + max_priority_fee + max_priority_fee_per_gas = max_priority_fee + tx_params["maxFeePerGas"] = max_fee_per_gas + tx_params["maxPriorityFeePerGas"] = max_priority_fee_per_gas + logger.info(f"Standard gas: {max_fee_per_gas/1e9:.2f} gwei") + + logger.info(f"Transaction parameters: nonce={nonce}, gas={gas_limit}, chain_id={chain_id}") + return tx_params + + + def _build_and_sign_transaction( + self, w3: Web3, contract_func: Any, indexer_addresses: List[str], + data_bytes: bytes, tx_params: Dict, private_key: str + ): + """Build and sign the transaction.""" + # Attempt to build the transaction + try: + transaction = contract_func(indexer_addresses, data_bytes).build_transaction(tx_params) + logger.info("Transaction built successfully") + + # If the transaction cannot be built, log the error and raise an exception + except Exception as e: + logger.error(f"Failed to build transaction: {e}") + logger.error(f"Contract function: {contract_func}") + logger.error(f"Indexer addresses count: {len(indexer_addresses)}") + logger.error(f"Data bytes length: {len(data_bytes)}") + logger.error(f"Transaction params: {tx_params}") + raise + + # Attempt to sign the transaction + try: + signed_tx = w3.eth.account.sign_transaction(transaction, private_key) + logger.info("Transaction signed successfully") + return signed_tx + + # If the transaction cannot be signed, log the error and raise an exception + except Exception as e: + logger.error(f"Failed to sign transaction: {e}") + raise + + + def _handle_transaction_error(self, error_msg: str) -> None: + """Handle and log specific transaction error types.""" + # If the error message contains "insufficient funds", log the error + if "insufficient funds" in error_msg.lower(): + logger.error("Insufficient funds to pay for gas") + + # If the error message contains "nonce too low", log the error + elif "nonce too low" in error_msg.lower(): + logger.error("Nonce is too low - transaction may have already been sent") + + # If the error message contains "nonce too high", log the error + elif "nonce too high" in error_msg.lower(): + logger.error("Nonce is too high - there may be pending transactions") + + # If the error message contains "gas", log the error + elif "gas" in error_msg.lower(): + logger.error("Gas-related issue - transaction may consume too much gas") + + # If the error message contains "400", log the error + elif "400" in error_msg: + logger.error("HTTP 400 Bad Request - RPC provider rejected the request") + + + def _send_signed_transaction(self, w3: Web3, signed_tx: Any) -> str: + """Send the signed transaction and handle errors.""" + # Attempt to send the transaction to the network + try: + tx_hash = w3.eth.send_raw_transaction(signed_tx.rawTransaction) + logger.info(f"Transaction sent! 
Hash: {tx_hash.hex()}") + return tx_hash.hex() + + # If the transaction could not be sent, log the error and raise an exception + except ValueError as e: + error_msg = str(e) + logger.error(f"Transaction rejected by network: {error_msg}") + self._handle_transaction_error(error_msg) + raise + + # If we get an unexpected error, log the error and raise an exception + except Exception as e: + logger.error(f"Unexpected error sending transaction: {e}") + logger.error(f"Error type: {type(e).__name__}") + raise + + + def _build_and_send_transaction( + self, + w3: Web3, + contract_func: Any, + indexer_addresses: List[str], + data_bytes: bytes, + sender_address: str, + private_key: str, + chain_id: int, + gas_limit: int, + nonce: int, + replace: bool, + ) -> str: + """ + Build, sign, and send the transaction. + + Args: + w3: Web3 instance + contract_func: Contract function to call + indexer_addresses: List of indexer addresses + data_bytes: Data bytes for transaction + sender_address: Transaction sender address + private_key: Private key for signing + chain_id: Chain ID + gas_limit: Gas limit for transaction + nonce: Transaction nonce + replace: Whether this is a replacement transaction + + Returns: + str: Transaction hash + """ + try: + # Get gas prices + base_fee, max_priority_fee = self._get_gas_prices(w3, replace) + + # Build transaction parameters + tx_params = self._build_transaction_params( + sender_address, nonce, chain_id, gas_limit, base_fee, max_priority_fee, replace + ) + + # Build and sign transaction + signed_tx = self._build_and_sign_transaction( + w3, contract_func, indexer_addresses, data_bytes, tx_params, private_key + ) + + # Send transaction + return self._send_signed_transaction(w3, signed_tx) + + # If we get an error, log the error and raise an exception + except Exception as e: + logger.error(f"Error in _build_and_send_transaction: {e}") + raise + + + def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Dict) -> str: + """ + Execute the complete transaction process using a single RPC connection. 
+ + Args: + w3: Web3 instance + contract: Contract instance + params: Dictionary containing all transaction parameters + + Returns: + str: Transaction hash + """ + # Extract parameters + private_key = params["private_key"] + contract_function = params["contract_function"] + indexer_addresses = params["indexer_addresses"] + data_bytes = params["data_bytes"] + sender_address = params["sender_address"] + chain_id = params["chain_id"] + replace = params["replace"] + + # Validate contract function exists + if not hasattr(contract.functions, contract_function): + raise ValueError(f"Contract {contract.address} does not have function: {contract_function}") + + contract_func = getattr(contract.functions, contract_function) + + # Log transaction details + logger.info(f"Contract address: {contract.address}") + logger.info(f"Contract function: {contract_function}") + logger.info(f"Number of indexers: {len(indexer_addresses)}") + logger.info(f"Data bytes length: {len(data_bytes)}") + logger.info(f"Chain ID: {chain_id}") + logger.info(f"Sender address: {sender_address}") + logger.info(f"Using RPC: {w3.provider.endpoint_uri}") + + # Check account balance + balance_wei = w3.eth.get_balance(sender_address) + balance_eth = w3.from_wei(balance_wei, "ether") + logger.info(f"Account balance: {balance_eth} ETH") + + # All transaction steps with the same RPC connection + gas_limit = self._estimate_transaction_gas(w3, contract_func, indexer_addresses, data_bytes, sender_address) + nonce = self._determine_transaction_nonce(w3, sender_address, replace) + tx_hash = self._build_and_send_transaction( + w3, + contract_func, + indexer_addresses, + data_bytes, + sender_address, + private_key, + chain_id, + gas_limit, + nonce, + replace, + ) + + # Wait for receipt with the same connection + try: + tx_receipt = w3.eth.wait_for_transaction_receipt(tx_hash, timeout=30) + if tx_receipt["status"] == 1: + logger.info( + f"Transaction confirmed in block {tx_receipt['blockNumber']}, gas used: {tx_receipt['gasUsed']}" + ) + else: + logger.error(f"Transaction failed on-chain: {tx_hash}") + except Exception as e: + logger.warning(f"Could not get transaction receipt: {str(e)} (transaction may still be pending)") + + return tx_hash + + + def _execute_transaction_with_rpc_failover( + self, operation_name: str, operation_func: Callable, operation_params: Dict + ) -> Any: + """ + Execute a transaction operation with automatic RPC failover. + This function tries each RPC provider in sequence until one succeeds. 
+ + Args: + operation_name: Human-readable name for the transaction operation + operation_func: Function that takes (w3, contract, operation_params) and executes the operation + operation_params: Parameters for the operation + + Returns: + Result of the operation_func + + Raises: + Exception: If all RPC providers fail + """ + # Initialize last_exception to None + last_exception = None + + # Try each RPC provider in sequence + for rpc_url in self.rpc_providers: + try: + # Log the attempt + logger.info(f"Attempting to do '{operation_name}' using RPC provider: {rpc_url}") + + # Get fresh connection for this rpc provider attempt + w3, contract, _ = self._get_working_web3_connection([rpc_url], self.contract_address, self.contract_abi) + + # Execute the operation with this rpc provider and return the result + return operation_func(w3, contract, operation_params) + + # If the operation fails, log the error and continue to the next rpc provider + except Exception as e: + logger.warning(f"{operation_name} failed with RPC provider {rpc_url}: {str(e)}") + last_exception = e + + # If we get here, all providers failed + logger.error(f"{operation_name} failed on all {len(self.rpc_providers)} RPC providers") + raise last_exception or Exception(f"All RPC providers failed for {operation_name}") + + + def send_transaction_to_allow_indexers( + self, + indexer_addresses: List[str], + private_key: str, + chain_id: int, + contract_function: str, + replace: bool = False, + data_bytes: bytes = b"", + ) -> str: + """ + Send a transaction to allow a subset of indexers to claim issuance rewards. + + Args: + indexer_addresses: List of indexer addresses to allow issuance + private_key: Private key for transaction signing + chain_id: Chain ID of the target blockchain + contract_function: Contract function name to call + replace: Flag to replace pending transactions + data_bytes: Optional bytes data to pass to contract function + + Returns: + str: Transaction hash + """ + # Set up account + temp_w3 = Web3() + sender_address = self._setup_transaction_account(private_key, temp_w3) + + # Convert addresses to checksum format + checksum_addresses = [Web3.to_checksum_address(addr) for addr in indexer_addresses] + + # Prepare all parameters for the transaction + transaction_params = { + "private_key": private_key, + "contract_function": contract_function, + "indexer_addresses": checksum_addresses, + "data_bytes": data_bytes, + "sender_address": sender_address, + "chain_id": chain_id, + "replace": replace, + } + + # Execute the transaction with RPC failover + try: + return self._execute_transaction_with_rpc_failover( + "Allow indexers to claim issuance", + self._execute_complete_transaction, + transaction_params, + ) + except Exception as e: + logger.error(f"Transaction failed on all RPC providers: {str(e)}") + raise + + + def batch_allow_indexers_issuance_eligibility( + self, + indexer_addresses: List[str], + private_key: str, + chain_id: int, + contract_function: str, + replace: bool = False, + batch_size: int = 250, + data_bytes: bytes = b"", + ) -> List[str]: + """ + Allow the issuance eligibility status of a list of indexers in batches. 
+ + Args: + indexer_addresses: List of indexer addresses to allow + private_key: Private key for transaction signing + chain_id: Chain ID of the target blockchain + contract_function: Contract function name to call + replace: Optional flag to replace pending transactions + batch_size: Optional batch size for processing large lists + data_bytes: Optional bytes data to pass to contract function + + Returns: + List[str]: List of transaction hashes from successful batches + """ + # Validate function parameters + if not indexer_addresses: + logger.warning("No indexers provided to allow. Returning empty list.") + return [] + if batch_size <= 0: + raise ValueError("batch_size must be positive") + + # Calculate number of batches to process + total_indexers = len(indexer_addresses) + num_batches = (total_indexers + batch_size - 1) // batch_size + logger.info(f"Processing {total_indexers} indexers in {num_batches} batch(es) of {batch_size}") + + try: + tx_links = [] + # Validate and format private key + validated_private_key = validate_and_format_private_key(private_key) + + # Process each batch + for i in range(num_batches): + start_idx = i * batch_size + end_idx = min(start_idx + batch_size, total_indexers) + batch_indexers = indexer_addresses[start_idx:end_idx] + + logger.info(f"Processing batch {i+1}/{num_batches} with {len(batch_indexers)} indexers") + + # Try to send the transaction to the network (uses RPC failover) + try: + tx_hash = self.send_transaction_to_allow_indexers( + batch_indexers, + validated_private_key, + chain_id, + contract_function, + replace, + data_bytes, + ) + tx_links.append(f"https://sepolia.arbiscan.io/tx/{tx_hash}") + logger.info(f"Batch {i+1} transaction successful: {tx_hash}") + + # If we get an error, log the error and raise an exception + except Exception as e: + logger.error(f"Error processing batch {i+1} due to: {e}") + raise + + # Log all transaction links + for i, tx_link in enumerate(tx_links, 1): + logger.info(f"Transaction link {i} of {len(tx_links)}: {tx_link}") + + return tx_links + + except KeyValidationError as e: + logger.error(f"Private key validation failed: {e}") + raise ValueError(f"Invalid private key: {e}") from e From 011c71b80f0a0d9b045c6213a0edcf8e97f141b0 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:26:45 +0200 Subject: [PATCH 17/48] Update bigquery_data_access_provider.py --- src/models/bigquery_data_access_provider.py | 29 ++++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/models/bigquery_data_access_provider.py b/src/models/bigquery_data_access_provider.py index 8fd68e9..fe0e99a 100644 --- a/src/models/bigquery_data_access_provider.py +++ b/src/models/bigquery_data_access_provider.py @@ -10,12 +10,8 @@ from bigframes import pandas as bpd from pandera.typing import DataFrame -from tenacity import ( - retry, - retry_if_exception_type, - stop_after_attempt, - wait_exponential, -) + +from src.utils.retry_decorator import retry_with_backoff # Module-level logger logger = logging.getLogger(__name__) @@ -30,16 +26,17 @@ def __init__(self, project: str, location: str) -> None: bpd.options.bigquery.project = project bpd.options.display.progress_bar = None - @retry( - retry=retry_if_exception_type((ConnectionError, socket.timeout)), - stop=stop_after_attempt(10), - wait=wait_exponential(multiplier=1, max=60), - reraise=True, + @retry_with_backoff( + max_attempts=10, + min_wait=1, + max_wait=60, + exceptions=(ConnectionError, socket.timeout) ) def _read_gbq_dataframe(self, query: str) -> 
DataFrame: """ Execute a read query on Google BigQuery and return the results as a pandas DataFrame. - Retries up to stop_after_attempt times on connection errors with exponential backoff. + Retries up to max_attempts times on connection errors with exponential backoff. + Note: This method uses the bigframes.pandas.read_gbq function to execute the query. It relies on Application Default Credentials (ADC) for authentication, primarily using the @@ -53,9 +50,10 @@ def _read_gbq_dataframe(self, query: str) -> DataFrame: logger.warning(f"GOOGLE_APPLICATION_CREDENTIALS path not found: {creds_path}") logger.warning("Falling back to gcloud CLI user credentials.") else: - logger.info("Using enviroment variable $GOOGLE_APPLICATION_CREDENTIALS for authentication.") + logger.info("Using environment variable $GOOGLE_APPLICATION_CREDENTIALS for authentication.") else: logger.warning("GOOGLE_APPLICATION_CREDENTIALS not set, falling back to gcloud CLI user credentials") + # Execute the query with retry logic return cast(DataFrame, bpd.read_gbq(query).to_pandas()) @@ -70,9 +68,11 @@ def _get_indexer_eligibility_query(self, start_date: date, end_date: date) -> st - Blocks behind <50,000, - Subgraph has >=500 GRT signal at query time Note: The 500 GRT curation signal requirement is not currently implemented. + Args: start_date (date): The start date for the data range. end_date (date): The end date for the data range. + Returns: str: SQL query string for indexer eligibility data. """ @@ -166,12 +166,15 @@ def fetch_indexer_issuance_eligibility_data(self, start_date: date, end_date: da """ Fetch data from Google BigQuery, used to determine indexer issuance eligibility, and compute each indexer's issuance eligibility status. + Depends on: - _get_indexer_eligibility_query() - _read_gbq_dataframe() + Args: start_date (date): The start date for the data to fetch from BigQuery. end_date (date): The end date for the data to fetch from BigQuery. + Returns: DataFrame: DataFrame containing a range of metrics for each indexer. 
The DataFrame contains the following columns: From e4936abb619aa96b152eeb87546321cc74efe78e Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:27:29 +0200 Subject: [PATCH 18/48] Update scheduler.py --- src/models/scheduler.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/models/scheduler.py b/src/models/scheduler.py index ac29d48..54cdff5 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -9,10 +9,8 @@ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential import src.models.service_quality_oracle as oracle -from src.models.issuance_data_access_helper import ( - _setup_google_credentials_in_memory_from_env_var, -) from src.utils.config_loader import load_config +from src.utils.config_manager import credential_manager from src.utils.slack_notifier import create_slack_notifier # Configure logging @@ -83,8 +81,9 @@ def run_oracle(force_date=None): today = force_date or datetime.now().date() start_time = datetime.now() logger.info(f"Starting Service Quality Oracle run at {start_time} for date {today}") + # Ensure we have valid google credentials before proceeding - _setup_google_credentials_in_memory_from_env_var() + credential_manager.setup_google_credentials() # Attempt to run the oracle try: @@ -184,7 +183,7 @@ def initialize(): validate_all_required_env_vars() # Validate credentials early to fail fast if there are issues - _setup_google_credentials_in_memory_from_env_var() + credential_manager.setup_google_credentials() # Load and validate configuration config = load_config() From 343924e679f68b5806788388895a346b1c3f4449 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:33:06 +0200 Subject: [PATCH 19/48] Update config_manager.py --- src/utils/config_manager.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py index 21f29d5..828c2e4 100644 --- a/src/utils/config_manager.py +++ b/src/utils/config_manager.py @@ -323,19 +323,22 @@ def validate_google_credentials(self) -> bool: Returns: bool: True if credentials are valid and working """ + # Try to validate the credentials try: import google.auth - - # Try to get default credentials credentials, project = google.auth.default() + # If the credentials are valid, log the success and return True if credentials: logger.info(f"Google credentials validated successfully for project: {project}") return True + + # If the credentials are not valid, log the error and return False else: logger.error("No valid Google credentials found") return False - + + # If the credentials could not be validated log the error except Exception as e: logger.error(f"Google credentials validation failed: {e}") return False From 05d14c0e2be7ccdf21c885e439cfc1b06b89e63c Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:33:15 +0200 Subject: [PATCH 20/48] Update service_quality_oracle.py --- src/models/service_quality_oracle.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 6c4b162..48f96eb 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -5,7 +5,6 @@ 2. Processing indexer data to determine eligibility 3. Submitting eligible indexers to the blockchain contract 4. Sending Slack notifications about run status -For blockchain interactions and data processing utilities, see issuance_data_access_helper.py. 
""" import logging @@ -17,13 +16,14 @@ # Add project root to path project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) sys.path.insert(0, project_root) + # Import data access utilities with absolute import from src.models.issuance_data_access_helper import ( - _setup_google_credentials_in_memory_from_env_var, batch_allow_indexers_issuance_eligibility_smart_contract, bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers, ) from src.utils.config_loader import load_config +from src.utils.config_manager import credential_manager from src.utils.slack_notifier import create_slack_notifier # Set up basic logging @@ -64,7 +64,7 @@ def main(): _ = google.auth.default() # If credentials could not be loaded, set them up in memory via helper function using environment variables except Exception: - _setup_google_credentials_in_memory_from_env_var() + credential_manager.setup_google_credentials() try: # Fetch + save indexer eligibility data and return eligible list as 'eligible_indexers' array @@ -91,7 +91,6 @@ def main(): if slack_notifier: # Calculate batch information for notification - config.get("BATCH_SIZE", 125) batch_count = len(transaction_links) if transaction_links else 0 total_processed = len(eligible_indexers) From 94e529571bfc8a48c57d4fc9a2e55fc98f934db7 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:43:57 +0200 Subject: [PATCH 21/48] Update blockchain_client.py --- src/models/blockchain_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/blockchain_client.py b/src/models/blockchain_client.py index aceaa50..7b84340 100644 --- a/src/models/blockchain_client.py +++ b/src/models/blockchain_client.py @@ -578,7 +578,7 @@ def batch_allow_indexers_issuance_eligibility( chain_id: int, contract_function: str, replace: bool = False, - batch_size: int = 250, + batch_size: int = 125, data_bytes: bytes = b"", ) -> List[str]: """ From a48461ae7ba9783bd0a28756e0f3fc1c91ee5324 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:44:21 +0200 Subject: [PATCH 22/48] Update helper module after moving functions to their own modules --- src/models/issuance_data_access_helper.py | 933 +--------------------- 1 file changed, 44 insertions(+), 889 deletions(-) diff --git a/src/models/issuance_data_access_helper.py b/src/models/issuance_data_access_helper.py index ca659ba..a124056 100644 --- a/src/models/issuance_data_access_helper.py +++ b/src/models/issuance_data_access_helper.py @@ -1,870 +1,58 @@ """ -Helper module containing utility functions related to data access and processing -for the Service Quality Oracle. +Helper module containing high-level functions for the Service Quality Oracle. 
+ +This module focuses on: +- Main data processing +- BigQuery data fetching and processing +- Integration between different components """ -import json import logging -import os -import shutil -from datetime import date, datetime -from pathlib import Path -from typing import Any +from datetime import date +from typing import List -import pandas as pd from tenacity import retry, stop_after_attempt, wait_exponential -from web3 import Web3 -from web3.contract import Contract -# Import data providers from src.models.bigquery_data_access_provider import BigQueryProvider -from src.models.subgraph_data_access_provider import SubgraphProvider - -# Import configuration and key validation -from src.utils.config_loader import ConfigLoader, ConfigurationError -from src.utils.key_validator import KeyValidationError, validate_and_format_private_key +from src.models.data_processor import DataProcessor +from src.utils.config_manager import ConfigManager logger = logging.getLogger(__name__) -# ============================================================================= -# CONFIGURATION AND SETUP FUNCTIONS -# ============================================================================= -def _validate_required_fields(data: dict, required_fields: list[str], context: str) -> None: - """ - Helper function to validate required fields are present in a dictionary. - Args: - data: Dictionary to validate - required_fields: List of required fields - context: Context for error message - Raises: - ValueError: If required fields are missing - """ - # Check if any required fields are missing from the data dictionary - missing_fields = [field for field in required_fields if field not in data] - if missing_fields: - raise ValueError(f"{context}: missing {missing_fields}") - - -def _load_config_and_return_validated() -> dict[str, Any]: - """ - Load all necessary configurations using config loader, validate, and return them. - # TODO: check config file return dict format correct (also in other functions throughout the codebase) - Returns: - Dict[str, Any]: Config dictionary with validated and converted values. 
- { - "bigquery_project_id": str, - "bigquery_location": str, - "rpc_providers": list[str], - "contract_address": str, - "contract_function": str, - "chain_id": int, - "scheduled_run_time": str, - "batch_size": int, - "max_age_before_deletion": int, - } - Raises: - ConfigurationError: If configuration loading fails - ValueError: If configuration validation fails - """ - try: - # Load configuration using config loader - loader = ConfigLoader() - config = loader.get_flat_config() - logger.info("Successfully loaded configuration") - # Validate and convert chain_id to integer - if config.get("chain_id"): - try: - config["chain_id"] = int(config["chain_id"]) - except ValueError as e: - raise ValueError(f"Invalid BLOCKCHAIN_CHAIN_ID: {config['chain_id']} - must be an integer.") from e - # Validate scheduled run time format (HH:MM) - if config.get("scheduled_run_time"): - try: - datetime.strptime(config["scheduled_run_time"], "%H:%M") - except ValueError as e: - raise ValueError( - f"Invalid SCHEDULED_RUN_TIME format: {config['scheduled_run_time']} - " - "must be in HH:MM format" - ) from e - # Validate blockchain configuration contains all required fields - required_fields = [ - "private_key", - "contract_address", - "contract_function", - "chain_id", - "scheduled_run_time", - ] - _validate_required_fields(config, required_fields, "Missing required blockchain configuration") - # Validate RPC providers - if not config.get("rpc_providers") or not isinstance(config["rpc_providers"], list): - raise ValueError("BLOCKCHAIN_RPC_URLS must be a list of valid RPC URLs") - return config - except ConfigurationError: - raise - except Exception as e: - raise ConfigurationError(f"Configuration validation failed: {e}") from e - - -def _get_path_to_project_root() -> Path: - """ - Get the path to the project root directory. - In Docker environments, use /app. Otherwise, find by marker files. - """ - # Use the /app directory as the project root if it exists - docker_path = Path("/app") - if docker_path.exists(): - return docker_path - # If the /app directory doesn't exist fall back to secondary detection logic - current_path = Path(__file__).parent - while current_path != current_path.parent: - if (current_path / ".gitignore").exists() or (current_path / "pyproject.toml").exists(): - logger.info(f"Found project root at: {current_path}") - return current_path - # Attempt to traverse upwards (will not work if the directory has no parent) - current_path = current_path.parent - # If we got here, something is wrong - raise FileNotFoundError("Could not find project root directory. Investigate.") - - -def _parse_and_validate_credentials_json(creds_env: str) -> dict: - """ - Parse and validate Google credentials JSON from environment variable. 
- Args: - creds_env: JSON string containing credentials - Returns: - dict: Parsed and validated credentials data - Raises: - ValueError: If JSON is invalid or credentials are incomplete - """ - # Try to parse the credentials JSON - try: - creds_data = json.loads(creds_env) - cred_type = creds_data.get("type", "") - # Validate the credentials data based on the type - if cred_type == "authorized_user": - required_fields = ["client_id", "client_secret", "refresh_token"] - _validate_required_fields(creds_data, required_fields, "Incomplete authorized_user credentials") - elif cred_type == "service_account": - required_fields = ["private_key", "client_email", "project_id"] - _validate_required_fields(creds_data, required_fields, "Incomplete service_account credentials") - else: - raise ValueError( - f"Unsupported credential type: '{cred_type}'. Expected 'authorized_user' or 'service_account'" - ) - # If the JSON is invalid, log an error and raise a ValueError - except Exception as e: - logger.error(f"Failed to parse and validate credentials JSON: {e}") - raise ValueError(f"Invalid credentials JSON: {e}") from e - # Return the parsed and validated credentials data - return creds_data - - -def _setup_user_credentials_in_memory(creds_data: dict) -> None: - """Set up user account credentials directly in memory.""" - import google.auth - from google.oauth2.credentials import Credentials - - try: - credentials = Credentials( - token=None, - refresh_token=creds_data.get("refresh_token"), - client_id=creds_data.get("client_id"), - client_secret=creds_data.get("client_secret"), - token_uri="https://oauth2.googleapis.com/token", - ) - # Set credentials globally for GCP libraries - google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] - logger.info("Successfully loaded user account credentials from environment variable") - finally: - # Clear sensitive data from local scope - if "creds_data" in locals(): - creds_data.clear() - - -def _setup_service_account_credentials_in_memory(creds_data: dict) -> None: - """Set up service account credentials directly in memory.""" - import google.auth - from google.oauth2 import service_account - - try: - # Create credentials object directly from dict - credentials = service_account.Credentials.from_service_account_info(creds_data) - # Set credentials globally for GCP libraries - google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] - logger.info("Successfully loaded service account credentials from environment variable") - except Exception as e: - logger.error(f"Failed to create service account credentials: {e}") - raise ValueError(f"Invalid service account credentials: {e}") from e - finally: - # Clear sensitive data from local scope - if "creds_data" in locals(): - creds_data.clear() - - -def _setup_google_credentials_in_memory_from_env_var(): - """ - Set up Google credentials directly in memory from environment variable. - This function handles multiple credential formats securely: - 1. JSON string in GOOGLE_APPLICATION_CREDENTIALS (inline credentials) - 2. Automatic fallback to gcloud CLI authentication - """ - # Get the account credentials from the environment variable - creds_env = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") - # If the credentials are not set, log a warning and return - if not creds_env: - logger.warning( - "GOOGLE_APPLICATION_CREDENTIALS not set. 
Falling back to gcloud CLI user credentials if available" - ) - return - # Case 1: JSON credentials provided inline - if creds_env.startswith("{"): - creds_data = None - try: - # Parse and validate credentials - creds_data = _parse_and_validate_credentials_json(creds_env) - cred_type = creds_data.get("type") - # Set up credentials based on type - if cred_type == "authorized_user": - _setup_user_credentials_in_memory(creds_data.copy()) - elif cred_type == "service_account": - _setup_service_account_credentials_in_memory(creds_data.copy()) - # If the credentials are invalid, log an error and raise a ValueError - except Exception as e: - logger.error("Failed to set up credentials from environment variable") - raise ValueError(f"Error processing inline credentials: {e}") from e - # Clear the original credentials dict from memory if it exists - finally: - if creds_data is not None: - creds_data.clear() - del creds_data - else: - logger.warning( - "GOOGLE_APPLICATION_CREDENTIALS is not set or not in the correct format. " - "Falling back to gcloud CLI authentication if available" - ) - return - - -# ============================================================================= -# DATA PROCESSING UTILITY FUNCTIONS -# ============================================================================= -def _export_bigquery_data_as_csvs_and_return_lists_of_ineligible_and_eligible_indexers( - input_data_from_bigquery: pd.DataFrame, output_date_dir: Path -) -> tuple[list, list]: - """ - Export BigQuery data as CSVs and return lists of eligible/ineligible indexers. - Args: - input_data_from_bigquery: Indexer data returned from BigQuery - output_date_dir: Path to date directory for output files - Returns: - Tuple[list, list]: Two lists of indexer addresses, eligible and ineligible - """ - # Ensure the output directory exists, creating parent directories if necessary - output_date_dir.mkdir(exist_ok=True, parents=True) - # Save raw data - raw_data_path = output_date_dir / "indexer_issuance_eligibility_data.csv" - input_data_from_bigquery.to_csv(raw_data_path, index=False) - logger.info(f"Saved raw bigquery results df to: {raw_data_path}") - # Filter eligible and ineligible indexers - eligible_df = input_data_from_bigquery[input_data_from_bigquery["eligible_for_indexing_rewards"] == 1] - ineligible_df = input_data_from_bigquery[input_data_from_bigquery["eligible_for_indexing_rewards"] == 0] - # Save filtered data - eligible_path = output_date_dir / "eligible_indexers.csv" - ineligible_path = output_date_dir / "ineligible_indexers.csv" - eligible_df[["indexer"]].to_csv(eligible_path, index=False) - ineligible_df[["indexer"]].to_csv(ineligible_path, index=False) - # Return lists of eligible and ineligible indexers - return eligible_df["indexer"].tolist(), ineligible_df["indexer"].tolist() - - -def _clean_old_date_directories(data_output_dir: Path, max_age_before_deletion: int): - """ - Remove old date directories to prevent unlimited growth. 
- Args: - data_output_dir: Path to the output directory - max_age_before_deletion: Maximum age in days before deleting data output - """ - today = date.today() - output_path = Path(data_output_dir) - # Only process directories with date format YYYY-MM-DD - for item in output_path.iterdir(): - if not item.is_dir(): - continue - try: - # Try to parse the directory name as a date - dir_date = datetime.strptime(item.name, "%Y-%m-%d").date() - age_days = (today - dir_date).days - # Remove if older than max_age_before_deletion - if age_days > max_age_before_deletion: - logger.info(f"Removing old data directory: {item} ({age_days} days old)") - shutil.rmtree(item) - # Skip directories that don't match date format - except ValueError: - continue - - -# ============================================================================= -# BLOCKCHAIN UTILITY FUNCTIONS (LOW-LEVEL) -# ============================================================================= -def _load_contract_abi() -> list[dict]: - """Load the contract ABI from the contracts directory.""" - try: - project_root = _get_path_to_project_root() - abi_path = project_root / "contracts" / "contract.abi.json" - with open(abi_path) as f: - return json.load(f) - # If the ABI file cannot be loaded, raise an error - except Exception as e: - logger.error(f"Failed to load contract ABI: {str(e)}") - raise - - -def _get_working_web3_connection( - rpc_providers: list[str], contract_address: str, contract_abi: list[dict] -) -> tuple[Web3, Contract, str]: - """ - Try connecting to RPC providers until one works. - Args: - rpc_providers: List of RPC provider URLs to try connecting to - contract_address: Contract address for creating contract instance - contract_abi: Contract ABI for creating contract instance - Returns: - Tuple[Web3, Contract, str]: Working web3 instance, contract instance, and provider URL - Raises: - ConnectionError: If all RPC providers fail - """ - for i, rpc_url in enumerate(rpc_providers): - try: - provider_type = "primary" if i == 0 else f"backup #{i}" - logger.info(f"Attempting to connect to {provider_type} RPC provider: {rpc_url}") - w3 = Web3(Web3.HTTPProvider(rpc_url)) - # Test connection - if w3.is_connected(): - logger.info(f"Successfully connected to {provider_type} RPC provider") - # Create contract instance and return web3 instance, contract instance, and provider URL - contract = w3.eth.contract(address=Web3.to_checksum_address(contract_address), abi=contract_abi) - return w3, contract, rpc_url - else: - logger.warning(f"Could not connect to {provider_type} RPC provider: {rpc_url}") - except Exception as e: - provider_type = "primary" if i == 0 else f"backup #{i}" - logger.warning(f"Error connecting to {provider_type} RPC provider {rpc_url}: {str(e)}") - # If we get here, all providers failed - raise ConnectionError(f"Failed to connect to any of {len(rpc_providers)} RPC providers: {rpc_providers}") - - -def _setup_transaction_account(private_key: str, w3) -> tuple[str, object]: - """ - Get the address of the account from the private key. 
- Args: - private_key: Private key for the account - w3: Web3 instance - Returns: - str: Address of the account - """ - try: - account = w3.eth.account.from_key(private_key) - logger.info(f"Using account: {account.address}") - return account.address - # If the account cannot be retrieved, log the error and raise an exception - except Exception as e: - logger.error(f"Failed to retrieve account from private key: {str(e)}") - raise - - -def _estimate_transaction_gas( - w3, contract_func, indexer_addresses: list[str], data_bytes: bytes, sender_address: str -) -> int: - """ - Estimate gas for the transaction with 25% buffer. - Args: - w3: Web3 instance - contract_func: Contract function to call - indexer_addresses: List of indexer addresses - data_bytes: Data bytes for the transaction - sender_address: Transaction sender address - Returns: - int: Estimated gas with 25% buffer - """ - # Try to estimate the gas for the transaction - try: - estimated_gas = contract_func(indexer_addresses, data_bytes).estimate_gas({"from": sender_address}) - gas_limit = int(estimated_gas * 1.25) # 25% buffer - logger.info(f"Estimated gas: {estimated_gas}, with buffer: {gas_limit}") - return gas_limit - # If the gas estimation fails, raise an error - except Exception as e: - logger.error(f"Gas estimation failed: {str(e)}") - raise - - -def _determine_transaction_nonce(w3, sender_address: str, replace: bool) -> int: - """ - Determine the appropriate nonce for the transaction. - Args: - w3: Web3 instance - sender_address: Transaction sender address - replace: Whether to replace pending transactions - Returns: - int: Transaction nonce to use - """ - # If we are not replacing a pending transaction, use the next available nonce - if not replace: - nonce = w3.eth.get_transaction_count(sender_address) - logger.info(f"Using next available nonce: {nonce}") - return nonce - # If we are replacing a pending transaction, try to find and replace it - logger.info("Attempting to find and replace a pending transaction") - # Try to find pending transactions - try: - pending_txs = w3.eth.get_block("pending", full_transactions=True) - sender_pending_txs = [ - tx for tx in pending_txs.transactions if hasattr(tx, "from") and tx["from"] == sender_address - ] - # If we found pending transactions, use the nonce of the first pending transaction - if sender_pending_txs: - sender_pending_txs.sort(key=lambda x: x["nonce"]) - nonce = sender_pending_txs[0]["nonce"] - logger.info(f"Found pending transaction with nonce {nonce} for replacement") - return nonce - # If we could not find pending transactions log a warning - except Exception as e: - logger.warning(f"Could not check pending transactions: {str(e)}") - # Check for nonce gaps - try: - current_nonce = w3.eth.get_transaction_count(sender_address, "pending") - latest_nonce = w3.eth.get_transaction_count(sender_address, "latest") - if current_nonce > latest_nonce: - logger.info(f"Detected nonce gap: latest={latest_nonce}, pending={current_nonce}") - return latest_nonce - except Exception as e: - logger.warning(f"Could not check nonce gap: {str(e)}") - # Fallback to next available nonce - nonce = w3.eth.get_transaction_count(sender_address) - logger.info(f"Using next available nonce: {nonce}") - return nonce - - -def _get_gas_prices(w3, replace: bool) -> tuple[int, int]: - """Get base fee and max priority fee for transaction.""" - # Get current gas prices with detailed logging - try: - latest_block = w3.eth.get_block("latest") - base_fee = latest_block["baseFeePerGas"] - logger.info(f"Latest 
block base fee: {base_fee/1e9:.2f} gwei") - # If the base fee cannot be retrieved, use a fallback value - except Exception as e: - logger.warning(f"Could not get base fee: {e}") - base_fee = w3.to_wei(10, "gwei") - try: - max_priority_fee = w3.eth.max_priority_fee - logger.info(f"Max priority fee: {max_priority_fee/1e9:.2f} gwei") - except Exception as e: - logger.warning(f"Could not get max priority fee: {e}") - max_priority_fee = w3.to_wei(2, "gwei") # fallback - return base_fee, max_priority_fee - - -def _build_transaction_params( - sender_address: str, - nonce: int, - chain_id: int, - gas_limit: int, - base_fee: int, - max_priority_fee: int, - replace: bool, -) -> dict: - """Build transaction parameters with appropriate gas prices.""" - tx_params = {"from": sender_address, "nonce": nonce, "chainId": chain_id, "gas": gas_limit} - # Set gas prices (higher for replacement transactions) - if replace: - max_fee_per_gas = base_fee * 4 + max_priority_fee * 2 - max_priority_fee_per_gas = max_priority_fee * 2 - tx_params["maxFeePerGas"] = max_fee_per_gas - tx_params["maxPriorityFeePerGas"] = max_priority_fee_per_gas - logger.info(f"High gas for replacement: {max_fee_per_gas/1e9:.2f} gwei") - # If we are not replacing a pending transaction, use a lower gas price - else: - max_fee_per_gas = base_fee * 2 + max_priority_fee - max_priority_fee_per_gas = max_priority_fee - tx_params["maxFeePerGas"] = max_fee_per_gas - tx_params["maxPriorityFeePerGas"] = max_priority_fee_per_gas - logger.info(f"Standard gas: {max_fee_per_gas/1e9:.2f} gwei") - logger.info(f"Transaction parameters: nonce={nonce}, gas={gas_limit}, chain_id={chain_id}") - return tx_params - - -def _build_and_sign_transaction( - w3, contract_func, indexer_addresses: list[str], data_bytes: bytes, tx_params: dict, private_key: str -): - """Build and sign the transaction.""" - # Attempt to build the transaction - try: - transaction = contract_func(indexer_addresses, data_bytes).build_transaction(tx_params) - logger.info("Transaction built successfully") - # If the transaction cannot be built, log the error and raise an exception - except Exception as e: - logger.error(f"Failed to build transaction: {e}") - logger.error(f"Contract function: {contract_func}") - logger.error(f"Indexer addresses count: {len(indexer_addresses)}") - logger.error(f"Data bytes length: {len(data_bytes)}") - logger.error(f"Transaction params: {tx_params}") - raise - # Attempt to sign the transaction - try: - signed_tx = w3.eth.account.sign_transaction(transaction, private_key) - logger.info("Transaction signed successfully") - return signed_tx - # If the transaction cannot be signed, log the error and raise an exception - except Exception as e: - logger.error(f"Failed to sign transaction: {e}") - raise - - -def _handle_transaction_error(error_msg: str) -> None: - """Handle and log specific transaction error types.""" - if "insufficient funds" in error_msg.lower(): - logger.error("Insufficient funds to pay for gas") - elif "nonce too low" in error_msg.lower(): - logger.error("Nonce is too low - transaction may have already been sent") - elif "nonce too high" in error_msg.lower(): - logger.error("Nonce is too high - there may be pending transactions") - elif "gas" in error_msg.lower(): - logger.error("Gas-related issue - transaction may consume too much gas") - elif "400" in error_msg: - logger.error("HTTP 400 Bad Request - RPC provider rejected the request") - - -def _send_signed_transaction(w3, signed_tx) -> str: - """Send the signed transaction and handle errors.""" - 
# Attempt to send the transaction to the network - try: - tx_hash = w3.eth.send_raw_transaction(signed_tx.rawTransaction) - logger.info(f"Transaction sent! Hash: {tx_hash.hex()}") - return tx_hash.hex() - # If the transaction could not be sent, log the error and raise an exception - except ValueError as e: - error_msg = str(e) - logger.error(f"Transaction rejected by network: {error_msg}") - _handle_transaction_error(error_msg) - raise - except Exception as e: - logger.error(f"Unexpected error sending transaction: {e}") - logger.error(f"Error type: {type(e).__name__}") - raise - - -def _build_and_send_transaction( - w3, - contract_func, - indexer_addresses: list[str], - data_bytes: bytes, - sender_address: str, - private_key: str, - chain_id: int, - gas_limit: int, - nonce: int, - replace: bool, -) -> str: - """ - Build, sign, and send the transaction. - Args: - w3: Web3 instance - contract_func: Contract function to call - indexer_addresses: List of indexer addresses - data_bytes: Data bytes for transaction - sender_address: Transaction sender address - private_key: Private key for signing - chain_id: Chain ID - gas_limit: Gas limit for transaction - nonce: Transaction nonce - replace: Whether this is a replacement transaction - Returns: - str: Transaction hash - """ - try: - # Get gas prices - base_fee, max_priority_fee = _get_gas_prices(w3, replace) - # Build transaction parameters - tx_params = _build_transaction_params( - sender_address, nonce, chain_id, gas_limit, base_fee, max_priority_fee, replace - ) - # Build and sign transaction - signed_tx = _build_and_sign_transaction( - w3, contract_func, indexer_addresses, data_bytes, tx_params, private_key - ) - # Send transaction - return _send_signed_transaction(w3, signed_tx) - except Exception as e: - logger.error(f"Error in _build_and_send_transaction: {e}") - raise - - -# ============================================================================= -# BLOCKCHAIN TRANSACTION FUNCTIONS (MID-LEVEL) -# ============================================================================= -def _execute_transaction_with_rpc_failover( - operation_name: str, rpc_providers: list[str], contract_address: str, operation_func, operation_params: dict -): - """ - Execute a transaction operation with automatic RPC failover. - This function tries each RPC provider in sequence until one succeeds. - If an RPC fails during any part of the transaction process, it moves to the next one. - Args: - operation_name: Human-readable name for the transaction operation, used for logging purposes - rpc_providers: List of RPC provider URLs to try connecting to - contract_address: Contract address - operation_func: Function that takes (w3, contract, operation_params) and does 'operation_name' operation - default 'operation_func' is _execute_complete_transaction() - operation_params: Parameters for the operation, e.g. 
- { - "private_key": private_key, - "contract_function": contract_function, - "indexer_addresses": indexer_addresses, - "data_bytes": data_bytes, - "sender_address": sender_address, - "account": account, - "chain_id": chain_id, - "replace": replace - } - Returns: - Result of the operation_func - Raises: - Exception: If all RPC providers fail - """ - # Initialize last_exception to None - last_exception = None - for rpc_url in rpc_providers: - try: - # Log the attempt - logger.info(f"Attempting to do '{operation_name}' using RPC provider: {rpc_url}") - # Get fresh connection for this rpc provider attempt - w3, contract, _ = _get_working_web3_connection([rpc_url], contract_address, _load_contract_abi()) - # Execute the operation with this rpc provider and return the result - return operation_func(w3, contract, operation_params) - # If the operation fails, log the error and continue to the next rpc provider - except Exception as e: - logger.warning(f"{operation_name} failed with RPC provider {rpc_url}: {str(e)}") - # Store the exception for later use - last_exception = e - # If we get here, all providers failed - logger.error(f"{operation_name} failed on all {len(rpc_providers)} RPC providers") - raise last_exception or Exception(f"All RPC providers failed for {operation_name}") - - -def _execute_complete_transaction(w3, contract, params): - """ - Execute the complete transaction process using a single RPC connection. - Args: - w3: Web3 instance - contract: Contract instance - params: Dictionary containing all transaction parameters - Returns: - str: Transaction hash - """ - # Extract parameters - private_key = params["private_key"] - contract_function = params["contract_function"] - indexer_addresses = params["indexer_addresses"] - data_bytes = params["data_bytes"] - sender_address = params["sender_address"] - chain_id = params["chain_id"] - replace = params["replace"] - # Validate contract function exists - if not hasattr(contract.functions, contract_function): - raise ValueError(f"Contract {contract.address} does not have function: {contract_function}") - contract_func = getattr(contract.functions, contract_function) - # Log transaction details - logger.info(f"Contract address: {contract.address}") - logger.info(f"Contract function: {contract_function}") - logger.info(f"Number of indexers: {len(indexer_addresses)}") - logger.info(f"Data bytes length: {len(data_bytes)}") - logger.info(f"Chain ID: {chain_id}") - logger.info(f"Sender address: {sender_address}") - logger.info(f"Using RPC: {w3.provider.endpoint_uri}") - # Check account balance - balance_wei = w3.eth.get_balance(sender_address) - balance_eth = w3.from_wei(balance_wei, "ether") - logger.info(f"Account balance: {balance_eth} ETH") - # All transaction steps with the same RPC connection - gas_limit = _estimate_transaction_gas(w3, contract_func, indexer_addresses, data_bytes, sender_address) - nonce = _determine_transaction_nonce(w3, sender_address, replace) - tx_hash = _build_and_send_transaction( - w3, - contract_func, - indexer_addresses, - data_bytes, - sender_address, - private_key, - chain_id, - gas_limit, - nonce, - replace, - ) - # Wait for receipt with the same connection - try: - tx_receipt = w3.eth.wait_for_transaction_receipt(tx_hash, timeout=30) - if tx_receipt["status"] == 1: - logger.info( - f"Transaction confirmed in block {tx_receipt['blockNumber']}, gas used: {tx_receipt['gasUsed']}" - ) - else: - logger.error(f"Transaction failed on-chain: {tx_hash}") - except Exception as e: - logger.warning(f"Could not get 
transaction receipt: {str(e)} (transaction may still be pending)") - return tx_hash - - -def _send_transaction_to_allow_indexers_in_list_to_claim_issuance( - list_of_indexers_that_can_claim_issuance: list[str], - private_key: str, - chain_id: int, - rpc_providers: list[str], - contract_address: str, - contract_function: str, - replace: bool = False, - data_bytes: bytes = b"", -) -> str: - """ - Send a transaction to the indexer eligibility oracle contract to allow a subset of indexers - to claim issuance rewards. - This function builds, signs, and sends a transaction to the blockchain using RPC failover. - This function is called by the batch_allow_indexers_issuance_eligibility_smart_contract function, which handles - batching of transactions if the list before input into this function. - Args: - list_of_indexers_that_can_claim_issuance: List of indexer addresses to allow issuance - private_key: Private key for transaction signing - chain_id: Chain ID of the target blockchain - rpc_providers: List of RPC provider URLs (primary + backups) - contract_address: Contract address - contract_function: Contract function name to call - replace: Flag to replace pending transactions - data_bytes: Optional bytes data to pass to contract function - Returns: - str: Transaction hash - """ - # Set up account - from web3 import Web3 - - temp_w3 = Web3() - sender_address = _setup_transaction_account(private_key, temp_w3) - # Convert addresses to checksum format - checksum_addresses = [Web3.to_checksum_address(addr) for addr in list_of_indexers_that_can_claim_issuance] - # Prepare all parameters for the transaction - transaction_params = { - "private_key": private_key, - "contract_function": contract_function, - "indexer_addresses": checksum_addresses, - "data_bytes": data_bytes, - "sender_address": sender_address, - "chain_id": chain_id, - "replace": replace, - } - # Execute the transaction to allow indexers to claim issuance with RPC failover - try: - return _execute_transaction_with_rpc_failover( - "Allow indexers to claim issuance", - rpc_providers, - contract_address, - _execute_complete_transaction, - transaction_params, - ) - except Exception as e: - logger.error(f"Transaction failed on all RPC providers: {str(e)}") - raise - - -# ============================================================================= -# HIGH-LEVEL BATCH TRANSACTION FUNCTION -# ============================================================================= -def batch_allow_indexers_issuance_eligibility_smart_contract( - list_of_indexers_to_allow: list[str], replace: bool = False, batch_size: int = 250, data_bytes: bytes = b"" -) -> list[str]: - """ - Allow the issuance eligibility status of a list of indexers in the smart contract. - This function handles batching of transactions if the list is too large for a single - transaction, and uses key validation for private keys. 
- Args: - list_of_indexers_to_allow: List of indexer addresses to allow - replace: Optional flag to replace pending transactions - batch_size: Optional batch size for processing large lists - data_bytes: Optional bytes data to pass to contract_address:contract_function - Returns: - List[str]: List of transaction hashes from successful batches - Raises: - ConfigurationError: If configuration loading fails - ValueError: If configuration is invalid - ConnectionError: If unable to connect to any RPC providers - Exception: If transaction processing fails - """ - # Get config - config = _load_config_and_return_validated() - # Validate function parameters look correct - if not list_of_indexers_to_allow: - logger.warning("No indexers provided to allow. Returning empty list.") - return [] - if batch_size <= 0: - raise ValueError("batch_size must be positive") - # Calculate number of batches to process - total_indexers_to_allow = len(list_of_indexers_to_allow) - num_batches = (total_indexers_to_allow + batch_size - 1) // batch_size - logger.info(f"Processing {total_indexers_to_allow} indexers in {num_batches} batch(es) of {batch_size}") - try: - tx_links = [] - # Validate and format private key - private_key = validate_and_format_private_key(str(config["private_key"])) - # Process each batch - for i in range(num_batches): - start_idx = i * batch_size - end_idx = min(start_idx + batch_size, total_indexers_to_allow) - batch_indexers = list_of_indexers_to_allow[start_idx:end_idx] - logger.info(f"Processing batch {i+1}/{num_batches} with {len(batch_indexers)} indexers") - try: - tx_hash = _send_transaction_to_allow_indexers_in_list_to_claim_issuance( - batch_indexers, - private_key, - int(config["chain_id"]), - list(config["rpc_providers"]), - str(config["contract_address"]), - str(config["contract_function"]), - replace, - data_bytes, - ) - tx_links.append(f"https://sepolia.arbiscan.io/tx/{tx_hash}") - logger.info(f"Batch {i+1} transaction successful: {tx_hash}") - except Exception as e: - logger.error(f"Error processing batch {i+1} due to: {e}") - # Print all the transaction links - for i, tx_link in enumerate(tx_links, 1): - logger.info(f"Transaction link {i} of {len(tx_links)}: {tx_link}") - return tx_links - except KeyValidationError as e: - logger.error(f"Private key validation failed: {e}") - raise ValueError(f"Invalid private key: {e}") from e - - -# ============================================================================= -# MAIN BIGQUERY DATA PROCESSING FUNCTION -# ============================================================================= @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=30, max=120), reraise=True) def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers( start_date: date, end_date: date, current_date: date, max_age_before_deletion: int, -) -> list[str]: +) -> List[str]: """ Main function to fetch and process data from BigQuery. 
+ + Args: + start_date: Start date for BigQuery data + end_date: End date for BigQuery data + current_date: Current date for output directory + max_age_before_deletion: Maximum age in days before deleting old data + Returns: List[str]: List of indexers that should be allowed issuance based on BigQuery data """ - # Load config using secure configuration loader - config = _load_config_and_return_validated() - # Initialize the BigQuery provider class so we can use its methods to fetch data from BigQuery + # Load configuration + config_manager = ConfigManager() + config = config_manager.load_and_validate_config() + project_root = config_manager.get_project_root() + + # Initialize bigquery provider bq_provider = BigQueryProvider( - project=str(config["bigquery_project_id"]), location=str(config["bigquery_location"]) + project=str(config["bigquery_project_id"]), + location=str(config["bigquery_location"]) ) + + # Initialize data processor + data_processor = DataProcessor(project_root) + try: # Fetch eligibility dataframe logger.info(f"Fetching eligibility data between {start_date} and {end_date}") @@ -872,66 +60,33 @@ def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eli start_date, end_date ) logger.info(f"Retrieved issuance eligibility data for {len(indexer_issuance_eligibility_data)} indexers") - # Store the output directory paths as variables so we can pass them to other functions - output_dir = _get_path_to_project_root() / "data" / "output" - date_dir = output_dir / current_date.strftime("%Y-%m-%d") - # Export separate lists for eligible and ineligible indexers + + # Get output directory for current date + date_dir = data_processor.get_date_output_directory(current_date) + + # Export data and get indexer lists logger.info(f"Attempting to export indexer issuance eligibility lists to: {date_dir}") eligible_indexers, ineligible_indexers = ( - _export_bigquery_data_as_csvs_and_return_lists_of_ineligible_and_eligible_indexers( + data_processor.export_bigquery_data_as_csvs_and_return_indexer_lists( indexer_issuance_eligibility_data, date_dir ) ) logger.info("Exported indexer issuance eligibility lists.") + # Clean old eligibility lists logger.info("Cleaning old eligibility lists.") - _clean_old_date_directories(output_dir, max_age_before_deletion) + data_processor.clean_old_date_directories(max_age_before_deletion) + # Log final summary logger.info(f"Processing complete. Output available at: {date_dir}") - # Log the number of eligible indexers logger.info( - f"No. of elig. indxrs. to insert into smart contract on {date.today()} is: {len(eligible_indexers)}" + f"No. of eligible indexers to insert into smart contract on {date.today()} is: {len(eligible_indexers)}" ) + # Return list of indexers that should be allowed issuance return eligible_indexers + except Exception as e: - logger.error(f"Error processing data: {str(e)}", exc_info=True) - raise - - -# ============================================================================= -# FUTURE FUNCTIONS (NOT USED YET) -# ============================================================================= -def _fetch_issuance_enabled_indexers_from_subgraph() -> list[str]: - """ - TODO: fix this once we have the subgraph - Queries the indexer eligibility subgraph to get the list of indexers that are - currently allowed issuance. 
- Returns: - List[str]: A list of indexer addresses that are currently allowed issuance - """ - # Load config and check that the necessary variables are set - config = _load_config_and_return_validated() - subgraph_url = config.get("subgraph_url") - studio_api_key = config.get("studio_api_key") - if not subgraph_url: - raise ValueError("SUBGRAPH_URL_PRODUCTION not set in configuration") - if not studio_api_key: - raise ValueError("STUDIO_API_KEY not set in configuration") - logger.info("Configuration for subgraph query loaded successfully.") - try: - # Initialize the subgraph provider class so we can use its methods to fetch data from our subgraph - subgraph_provider = SubgraphProvider() - # Fetch all indexers from the subgraph - indexers_data = subgraph_provider.fetch_all_indexers() - logger.info(f"Retrieved data for {len(indexers_data)} indexers from subgraph") - # Extract currently denied indexers (those where isDenied is True) - allowed_indexers = [] - for indexer in indexers_data: - if indexer.get("isDenied", False): - allowed_indexers.append(indexer["id"].lower()) - logger.info(f"Found {len(allowed_indexers)} indexers that are currently allowed issuance") - return allowed_indexers - except Exception as e: - logger.error(f"Error fetching allowed indexers from subgraph: {str(e)}", exc_info=True) + logger.error(f"Transaction failed on all RPC providers: {str(e)}") raise + \ No newline at end of file From 388a538cb15866ae430b931857578f59897ffe99 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 00:46:02 +0200 Subject: [PATCH 23/48] feat: Overhaul Ruff Settings Update issuance_data_access_helper.py Update ruff_check_format_assets.sh update ruff settings Ruff selective turn off ruff ruff linting linting Update custom_formatter.py Update service_quality_oracle.py Update custom_formatter.py Update retry_decorator.py Update custom_formatter.py Update retry_decorator.py --- pyproject.toml | 13 +- scripts/custom_formatter.py | 133 +++++++++++++++++ scripts/ruff_check_format_assets.sh | 21 ++- src/models/bigquery_data_access_provider.py | 18 ++- src/models/blockchain_client.py | 150 +++++++++++--------- src/models/data_processor.py | 26 ++-- src/models/issuance_data_access_helper.py | 27 ++-- src/models/scheduler.py | 11 +- src/models/service_quality_oracle.py | 4 +- src/models/subgraph_data_access_provider.py | 13 +- src/utils/config_loader.py | 27 ++-- src/utils/config_manager.py | 70 ++++----- src/utils/retry_decorator.py | 13 +- src/utils/slack_notifier.py | 6 +- 14 files changed, 348 insertions(+), 184 deletions(-) create mode 100644 scripts/custom_formatter.py diff --git a/pyproject.toml b/pyproject.toml index 93daa9e..2ea9859 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,13 @@ target-version = "py39" fix = true fix-only = false +[tool.ruff.format] +# Format SQL code in strings/docstrings +docstring-code-format = false +quote-style = "double" +indent-style = "space" +line-ending = "lf" + [tool.ruff.lint] # Enable rules including isort (I) for import sorting and additional fixes select = ["E", "W", "F", "I"] @@ -44,12 +51,6 @@ known-first-party = ["src"] # Unlike Flake8, default to a complexity level of 10. 
max-complexity = 10 -[tool.ruff.format] -# Format SQL code in strings/docstrings -docstring-code-format = true -quote-style = "double" -indent-style = "space" - [tool.mypy] ignore_missing_imports = true no_strict_optional = true diff --git a/scripts/custom_formatter.py b/scripts/custom_formatter.py new file mode 100644 index 0000000..f8dcb5a --- /dev/null +++ b/scripts/custom_formatter.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import argparse +import ast +import sys +from pathlib import Path + + +class PythonFormatter: + + def __init__(self, source_code: str): + self.source_lines = source_code.splitlines() + self.tree = ast.parse(source_code) + self.node_parents = { + child: parent for parent in ast.walk(self.tree) for child in ast.iter_child_nodes(parent) + } + self.disabled_ranges = self._find_disabled_ranges() + + + def _find_disabled_ranges(self): + ranges = [] + in_disabled_block = False + start_line = 0 + for i, line in enumerate(self.source_lines): + if "# fmt: off" in line: + in_disabled_block = True + start_line = i + 1 + elif "# fmt: on" in line: + if in_disabled_block: + ranges.append((start_line, i + 1)) + in_disabled_block = False + return ranges + + + def _is_in_disabled_range(self, lineno): + for start, end in self.disabled_ranges: + if start <= lineno <= end: + return True + return False + + + def get_node_start_line(self, node): + if node.decorator_list: + return node.decorator_list[0].lineno + return node.lineno + + + def is_method(self, node) -> bool: + return isinstance(self.node_parents.get(node), ast.ClassDef) + + + def format(self) -> str: + nodes = {} + for node in ast.walk(self.tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + start_line = self.get_node_start_line(node) + nodes[start_line] = node + + lines = list(self.source_lines) + sorted_nodes = sorted(nodes.items(), key=lambda x: x[0], reverse=True) + + for lineno, node in sorted_nodes: + start_index = lineno - 1 + num_blank_lines = 0 + + # Skip formatting if node is inside a "fmt: off" block + if self._is_in_disabled_range(lineno): + continue + + if isinstance(node, ast.ClassDef): + num_blank_lines = 2 + elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)): + if self.is_method(node): + if node.name == "__init__": + num_blank_lines = 1 + else: + num_blank_lines = 2 + else: + num_blank_lines = 2 + + i = start_index - 1 + while i > 0 and not lines[i].strip(): + i -= 1 + + if i < 0: # start of file + i = -1 # will insert at 0 + + # For top-level nodes, we don't want to add spaces if it's the first thing in the file + # after imports. Let's check if there's anything but imports above. 
+ is_truly_top_level = i == -1 + if not is_truly_top_level: + # Count existing blank lines + existing_blank_lines = 0 + for k in range(start_index - 1, i, -1): + if not lines[k].strip(): + existing_blank_lines += 1 + + # Only add lines if there are not enough + if existing_blank_lines < num_blank_lines: + # remove existing blank lines + del lines[i + 1 : start_index] + # insert new blank lines + for _ in range(num_blank_lines): + lines.insert(i + 1, "") + + result = "\n".join(line.rstrip() for line in lines) + if result: + result = result.strip() + "\n" + + return result + + +def main(): + parser = argparse.ArgumentParser(description="Python custom formatter.") + parser.add_argument("files", nargs="+", type=Path) + args = parser.parse_args() + + for path in args.files: + try: + source = path.read_text() + # Skip empty files + if not source.strip(): + continue + formatter = PythonFormatter(source) + formatted_source = formatter.format() + path.write_text(formatted_source) + print(f"Formatted {path}") + except Exception as e: + print(f"Could not format {path}: {e}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/scripts/ruff_check_format_assets.sh b/scripts/ruff_check_format_assets.sh index c141eeb..0962148 100755 --- a/scripts/ruff_check_format_assets.sh +++ b/scripts/ruff_check_format_assets.sh @@ -8,24 +8,23 @@ if [ ! -f "requirements.txt" ]; then exit 1 fi +# Check if pyproject.toml exists with ruff configuration +if [ ! -f "pyproject.toml" ]; then + echo "Error: pyproject.toml not found. Make sure it exists with proper ruff configuration" + exit 1 +fi + # Run ruff check with auto-fix first (including unsafe fixes for typing annotations) echo "Running ruff check with auto-fix..." ruff check src tests scripts --fix --unsafe-fixes --show-fixes -# Run ruff format +# Run ruff format with respect to project configuration echo "Running ruff format..." ruff format src tests scripts -# Fix SQL-specific whitespace issues after ruff (only trailing whitespace, avoid blank line removal) -echo "Fixing SQL trailing whitespace issues in BigQuery provider..." -if [[ "$OSTYPE" == "darwin"* ]]; then - # macOS - Only fix trailing whitespace after SQL keywords - find src/models -name "*.py" -type f -exec sed -i '' -E 's/([A-Z]+) +$/\1/g' {} \; -else - # Linux (CI environment) - Only fix trailing whitespace after SQL keywords - find src/models -name "*.py" -type f -exec sed -i -E 's/([A-Z]+) +$/\1/g' {} \; -fi -echo "SQL whitespace issues fixed!" +# Post-process files to ensure custom spacing rules are applied +echo "Applying custom spacing rules with custom formatter..." 
+find src tests scripts -name "*.py" -print0 | xargs -0 python3 scripts/custom_formatter.py # Show remaining issues (mainly line length issues that need manual intervention) echo -e "\n\nRemaining issues that need manual attention:" diff --git a/src/models/bigquery_data_access_provider.py b/src/models/bigquery_data_access_provider.py index fe0e99a..23d6d56 100644 --- a/src/models/bigquery_data_access_provider.py +++ b/src/models/bigquery_data_access_provider.py @@ -26,17 +26,13 @@ def __init__(self, project: str, location: str) -> None: bpd.options.bigquery.project = project bpd.options.display.progress_bar = None - @retry_with_backoff( - max_attempts=10, - min_wait=1, - max_wait=60, - exceptions=(ConnectionError, socket.timeout) - ) + + @retry_with_backoff(max_attempts=10, min_wait=1, max_wait=60, exceptions=(ConnectionError, socket.timeout)) def _read_gbq_dataframe(self, query: str) -> DataFrame: """ Execute a read query on Google BigQuery and return the results as a pandas DataFrame. Retries up to max_attempts times on connection errors with exponential backoff. - + Note: This method uses the bigframes.pandas.read_gbq function to execute the query. It relies on Application Default Credentials (ADC) for authentication, primarily using the @@ -57,6 +53,7 @@ def _read_gbq_dataframe(self, query: str) -> DataFrame: # Execute the query with retry logic return cast(DataFrame, bpd.read_gbq(query).to_pandas()) + def _get_indexer_eligibility_query(self, start_date: date, end_date: date) -> str: """ Construct an SQL query that calculates indexer eligibility: @@ -162,19 +159,20 @@ def _get_indexer_eligibility_query(self, start_date: date, end_date: date) -> st total_good_days_online DESC, good_responses DESC """ + def fetch_indexer_issuance_eligibility_data(self, start_date: date, end_date: date) -> DataFrame: """ Fetch data from Google BigQuery, used to determine indexer issuance eligibility, and compute each indexer's issuance eligibility status. - + Depends on: - _get_indexer_eligibility_query() - _read_gbq_dataframe() - + Args: start_date (date): The start date for the data to fetch from BigQuery. end_date (date): The end date for the data to fetch from BigQuery. - + Returns: DataFrame: DataFrame containing a range of metrics for each indexer. The DataFrame contains the following columns: diff --git a/src/models/blockchain_client.py b/src/models/blockchain_client.py index 7b84340..0e5b2fb 100644 --- a/src/models/blockchain_client.py +++ b/src/models/blockchain_client.py @@ -24,7 +24,7 @@ class BlockchainClient: """Handles all blockchain interactions""" - + def __init__(self, rpc_providers: List[str], contract_address: str, project_root: Path): """ Initialize the blockchain client. 
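Note on the shared retry helper used in the BigQuery hunk above: the `@retry_with_backoff(...)` decorator applied to `_read_gbq_dataframe` replaces the inline tenacity `@retry(...)` stack with a single reusable wrapper whose implementation lives in `src/utils/retry_decorator.py` (added earlier in this series and not reproduced here). A minimal sketch of such a wrapper, assuming it simply forwards its parameters to tenacity, might look like this:

    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


    def retry_with_backoff(max_attempts=3, min_wait=1, max_wait=60, exceptions=(Exception,)):
        """Return a decorator that retries on `exceptions` with exponential backoff."""
        return retry(
            retry=retry_if_exception_type(exceptions),
            stop=stop_after_attempt(max_attempts),
            wait=wait_exponential(multiplier=1, min=min_wait, max=max_wait),
            reraise=True,
        )


    # Example call site, mirroring the parameters used on _read_gbq_dataframe above:
    @retry_with_backoff(max_attempts=10, min_wait=1, max_wait=60, exceptions=(ConnectionError,))
    def fetch_data():
        ...

Centralising the retry policy this way keeps call sites declarative and consistent across providers; the exact signature of the project's real helper is an assumption here and may differ.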
@@ -47,7 +47,7 @@ def _load_contract_abi(self) -> List[Dict]: abi_path = self.project_root / "contracts" / "contract.abi.json" with open(abi_path) as f: return json.load(f) - + # If the ABI file cannot be loaded, raise an error except Exception as e: logger.error(f"Failed to load contract ABI: {str(e)}") @@ -78,7 +78,7 @@ def _get_working_web3_connection( provider_type = "primary" if i == 0 else f"backup #{i}" logger.info(f"Attempting to connect to {provider_type} RPC provider: {rpc_url}") w3 = Web3(Web3.HTTPProvider(rpc_url)) - + # Test connection if w3.is_connected(): logger.info(f"Successfully connected to {provider_type} RPC provider") @@ -87,21 +87,21 @@ def _get_working_web3_connection( address=Web3.to_checksum_address(contract_address), abi=contract_abi ) - # + # return w3, contract, rpc_url # If we could not connect log the error else: logger.warning(f"Could not connect to {provider_type} RPC provider: {rpc_url}") - + # If we get an error, log the error except Exception as e: provider_type = "primary" if i == 0 else f"backup #{i}" logger.warning(f"Error connecting to {provider_type} RPC provider {rpc_url}: {str(e)}") - + # If we get here, all providers failed raise ConnectionError(f"Failed to connect to any of {len(rpc_providers)} RPC providers: {rpc_providers}") - + def _setup_transaction_account(self, private_key: str, w3: Web3) -> str: """ @@ -118,7 +118,7 @@ def _setup_transaction_account(self, private_key: str, w3: Web3) -> str: account = w3.eth.account.from_key(private_key) logger.info(f"Using account: {account.address}") return account.address - + # If the account cannot be retrieved, log the error and raise an exception except Exception as e: logger.error(f"Failed to retrieve account from private key: {str(e)}") @@ -126,19 +126,18 @@ def _setup_transaction_account(self, private_key: str, w3: Web3) -> str: def _estimate_transaction_gas( - self, w3: Web3, contract_func: Any, indexer_addresses: List[str], - data_bytes: bytes, sender_address: str + self, w3: Web3, contract_func: Any, indexer_addresses: List[str], data_bytes: bytes, sender_address: str ) -> int: """ Estimate gas for the transaction with 25% buffer. - + Args: w3: Web3 instance contract_func: Contract function to call indexer_addresses: List of indexer addresses data_bytes: Data bytes for the transaction sender_address: Transaction sender address - + Returns: int: Estimated gas with 25% buffer """ @@ -158,12 +157,12 @@ def _estimate_transaction_gas( def _determine_transaction_nonce(self, w3: Web3, sender_address: str, replace: bool) -> int: """ Determine the appropriate nonce for the transaction. 
- + Args: w3: Web3 instance sender_address: Transaction sender address replace: Whether to replace pending transactions - + Returns: int: Transaction nonce to use """ @@ -172,29 +171,28 @@ def _determine_transaction_nonce(self, w3: Web3, sender_address: str, replace: b nonce = w3.eth.get_transaction_count(sender_address) logger.info(f"Using next available nonce: {nonce}") return nonce - + # If we are replacing a pending transaction, try to find and replace it logger.info("Attempting to find and replace a pending transaction") - + # Try to find pending transactions try: pending_txs = w3.eth.get_block("pending", full_transactions=True) sender_pending_txs = [ - tx for tx in pending_txs.transactions - if hasattr(tx, "from") and tx["from"] == sender_address + tx for tx in pending_txs.transactions if hasattr(tx, "from") and tx["from"] == sender_address ] - + # If we found pending transactions, use the nonce of the first pending transaction if sender_pending_txs: sender_pending_txs.sort(key=lambda x: x["nonce"]) nonce = sender_pending_txs[0]["nonce"] logger.info(f"Found pending transaction with nonce {nonce} for replacement") return nonce - + # If we could not find pending transactions log the issue except Exception as e: logger.warning(f"Could not check pending transactions: {str(e)}") - + # Check for nonce gaps try: current_nonce = w3.eth.get_transaction_count(sender_address, "pending") @@ -202,11 +200,11 @@ def _determine_transaction_nonce(self, w3: Web3, sender_address: str, replace: b if current_nonce > latest_nonce: logger.info(f"Detected nonce gap: latest={latest_nonce}, pending={current_nonce}") return latest_nonce - + # If we could not check nonce gaps log the issue except Exception as e: logger.warning(f"Could not check nonce gap: {str(e)}") - + # Fallback to next available nonce nonce = w3.eth.get_transaction_count(sender_address) logger.info(f"Using next available nonce: {nonce}") @@ -220,22 +218,22 @@ def _get_gas_prices(self, w3: Web3, replace: bool) -> Tuple[int, int]: latest_block = w3.eth.get_block("latest") base_fee = latest_block["baseFeePerGas"] logger.info(f"Latest block base fee: {base_fee/1e9:.2f} gwei") - + # If the base fee cannot be retrieved, use a fallback value except Exception as e: logger.warning(f"Could not get base fee: {e}") base_fee = w3.to_wei(10, "gwei") - + # Try to get the max priority fee try: max_priority_fee = w3.eth.max_priority_fee logger.info(f"Max priority fee: {max_priority_fee/1e9:.2f} gwei") - + # If the max priority fee cannot be retrieved, use a fallback value except Exception as e: logger.warning(f"Could not get max priority fee: {e}") max_priority_fee = w3.to_wei(2, "gwei") # fallback - + # Return the base fee and max priority fee return base_fee, max_priority_fee @@ -268,14 +266,19 @@ def _build_transaction_params( tx_params["maxFeePerGas"] = max_fee_per_gas tx_params["maxPriorityFeePerGas"] = max_priority_fee_per_gas logger.info(f"Standard gas: {max_fee_per_gas/1e9:.2f} gwei") - + logger.info(f"Transaction parameters: nonce={nonce}, gas={gas_limit}, chain_id={chain_id}") return tx_params def _build_and_sign_transaction( - self, w3: Web3, contract_func: Any, indexer_addresses: List[str], - data_bytes: bytes, tx_params: Dict, private_key: str + self, + w3: Web3, + contract_func: Any, + indexer_addresses: List[str], + data_bytes: bytes, + tx_params: Dict, + private_key: str, ): """Build and sign the transaction.""" # Attempt to build the transaction @@ -297,7 +300,7 @@ def _build_and_sign_transaction( signed_tx = 
w3.eth.account.sign_transaction(transaction, private_key) logger.info("Transaction signed successfully") return signed_tx - + # If the transaction cannot be signed, log the error and raise an exception except Exception as e: logger.error(f"Failed to sign transaction: {e}") @@ -309,19 +312,19 @@ def _handle_transaction_error(self, error_msg: str) -> None: # If the error message contains "insufficient funds", log the error if "insufficient funds" in error_msg.lower(): logger.error("Insufficient funds to pay for gas") - + # If the error message contains "nonce too low", log the error elif "nonce too low" in error_msg.lower(): logger.error("Nonce is too low - transaction may have already been sent") - + # If the error message contains "nonce too high", log the error elif "nonce too high" in error_msg.lower(): logger.error("Nonce is too high - there may be pending transactions") - + # If the error message contains "gas", log the error elif "gas" in error_msg.lower(): logger.error("Gas-related issue - transaction may consume too much gas") - + # If the error message contains "400", log the error elif "400" in error_msg: logger.error("HTTP 400 Bad Request - RPC provider rejected the request") @@ -334,14 +337,14 @@ def _send_signed_transaction(self, w3: Web3, signed_tx: Any) -> str: tx_hash = w3.eth.send_raw_transaction(signed_tx.rawTransaction) logger.info(f"Transaction sent! Hash: {tx_hash.hex()}") return tx_hash.hex() - + # If the transaction could not be sent, log the error and raise an exception except ValueError as e: error_msg = str(e) logger.error(f"Transaction rejected by network: {error_msg}") self._handle_transaction_error(error_msg) raise - + # If we get an unexpected error, log the error and raise an exception except Exception as e: logger.error(f"Unexpected error sending transaction: {e}") @@ -383,20 +386,20 @@ def _build_and_send_transaction( try: # Get gas prices base_fee, max_priority_fee = self._get_gas_prices(w3, replace) - + # Build transaction parameters tx_params = self._build_transaction_params( sender_address, nonce, chain_id, gas_limit, base_fee, max_priority_fee, replace ) - + # Build and sign transaction signed_tx = self._build_and_sign_transaction( w3, contract_func, indexer_addresses, data_bytes, tx_params, private_key ) - + # Send transaction return self._send_signed_transaction(w3, signed_tx) - + # If we get an error, log the error and raise an exception except Exception as e: logger.error(f"Error in _build_and_send_transaction: {e}") @@ -406,12 +409,12 @@ def _build_and_send_transaction( def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Dict) -> str: """ Execute the complete transaction process using a single RPC connection. 
- + Args: w3: Web3 instance contract: Contract instance params: Dictionary containing all transaction parameters - + Returns: str: Transaction hash """ @@ -423,13 +426,13 @@ def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Di sender_address = params["sender_address"] chain_id = params["chain_id"] replace = params["replace"] - + # Validate contract function exists if not hasattr(contract.functions, contract_function): raise ValueError(f"Contract {contract.address} does not have function: {contract_function}") - + contract_func = getattr(contract.functions, contract_function) - + # Log transaction details logger.info(f"Contract address: {contract.address}") logger.info(f"Contract function: {contract_function}") @@ -438,14 +441,16 @@ def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Di logger.info(f"Chain ID: {chain_id}") logger.info(f"Sender address: {sender_address}") logger.info(f"Using RPC: {w3.provider.endpoint_uri}") - + # Check account balance balance_wei = w3.eth.get_balance(sender_address) balance_eth = w3.from_wei(balance_wei, "ether") logger.info(f"Account balance: {balance_eth} ETH") - + # All transaction steps with the same RPC connection - gas_limit = self._estimate_transaction_gas(w3, contract_func, indexer_addresses, data_bytes, sender_address) + gas_limit = self._estimate_transaction_gas( + w3, contract_func, indexer_addresses, data_bytes, sender_address + ) nonce = self._determine_transaction_nonce(w3, sender_address, replace) tx_hash = self._build_and_send_transaction( w3, @@ -459,19 +464,20 @@ def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Di nonce, replace, ) - + # Wait for receipt with the same connection try: tx_receipt = w3.eth.wait_for_transaction_receipt(tx_hash, timeout=30) if tx_receipt["status"] == 1: logger.info( - f"Transaction confirmed in block {tx_receipt['blockNumber']}, gas used: {tx_receipt['gasUsed']}" + f"Transaction confirmed in block {tx_receipt['blockNumber']}, " + f"gas used: {tx_receipt['gasUsed']}" ) else: logger.error(f"Transaction failed on-chain: {tx_hash}") except Exception as e: logger.warning(f"Could not get transaction receipt: {str(e)} (transaction may still be pending)") - + return tx_hash @@ -481,15 +487,15 @@ def _execute_transaction_with_rpc_failover( """ Execute a transaction operation with automatic RPC failover. This function tries each RPC provider in sequence until one succeeds. 
- + Args: operation_name: Human-readable name for the transaction operation operation_func: Function that takes (w3, contract, operation_params) and executes the operation operation_params: Parameters for the operation - + Returns: Result of the operation_func - + Raises: Exception: If all RPC providers fail """ @@ -503,7 +509,9 @@ def _execute_transaction_with_rpc_failover( logger.info(f"Attempting to do '{operation_name}' using RPC provider: {rpc_url}") # Get fresh connection for this rpc provider attempt - w3, contract, _ = self._get_working_web3_connection([rpc_url], self.contract_address, self.contract_abi) + w3, contract, _ = self._get_working_web3_connection( + [rpc_url], self.contract_address, self.contract_abi + ) # Execute the operation with this rpc provider and return the result return operation_func(w3, contract, operation_params) @@ -512,7 +520,7 @@ def _execute_transaction_with_rpc_failover( except Exception as e: logger.warning(f"{operation_name} failed with RPC provider {rpc_url}: {str(e)}") last_exception = e - + # If we get here, all providers failed logger.error(f"{operation_name} failed on all {len(self.rpc_providers)} RPC providers") raise last_exception or Exception(f"All RPC providers failed for {operation_name}") @@ -529,7 +537,7 @@ def send_transaction_to_allow_indexers( ) -> str: """ Send a transaction to allow a subset of indexers to claim issuance rewards. - + Args: indexer_addresses: List of indexer addresses to allow issuance private_key: Private key for transaction signing @@ -537,17 +545,17 @@ def send_transaction_to_allow_indexers( contract_function: Contract function name to call replace: Flag to replace pending transactions data_bytes: Optional bytes data to pass to contract function - + Returns: str: Transaction hash """ # Set up account temp_w3 = Web3() sender_address = self._setup_transaction_account(private_key, temp_w3) - + # Convert addresses to checksum format checksum_addresses = [Web3.to_checksum_address(addr) for addr in indexer_addresses] - + # Prepare all parameters for the transaction transaction_params = { "private_key": private_key, @@ -558,7 +566,7 @@ def send_transaction_to_allow_indexers( "chain_id": chain_id, "replace": replace, } - + # Execute the transaction with RPC failover try: return self._execute_transaction_with_rpc_failover( @@ -583,7 +591,7 @@ def batch_allow_indexers_issuance_eligibility( ) -> List[str]: """ Allow the issuance eligibility status of a list of indexers in batches. 
- + Args: indexer_addresses: List of indexer addresses to allow private_key: Private key for transaction signing @@ -592,7 +600,7 @@ def batch_allow_indexers_issuance_eligibility( replace: Optional flag to replace pending transactions batch_size: Optional batch size for processing large lists data_bytes: Optional bytes data to pass to contract function - + Returns: List[str]: List of transaction hashes from successful batches """ @@ -602,25 +610,25 @@ def batch_allow_indexers_issuance_eligibility( return [] if batch_size <= 0: raise ValueError("batch_size must be positive") - + # Calculate number of batches to process total_indexers = len(indexer_addresses) num_batches = (total_indexers + batch_size - 1) // batch_size logger.info(f"Processing {total_indexers} indexers in {num_batches} batch(es) of {batch_size}") - + try: tx_links = [] # Validate and format private key validated_private_key = validate_and_format_private_key(private_key) - + # Process each batch for i in range(num_batches): start_idx = i * batch_size end_idx = min(start_idx + batch_size, total_indexers) batch_indexers = indexer_addresses[start_idx:end_idx] - + logger.info(f"Processing batch {i+1}/{num_batches} with {len(batch_indexers)} indexers") - + # Try to send the transaction to the network (uses RPC failover) try: tx_hash = self.send_transaction_to_allow_indexers( @@ -633,7 +641,7 @@ def batch_allow_indexers_issuance_eligibility( ) tx_links.append(f"https://sepolia.arbiscan.io/tx/{tx_hash}") logger.info(f"Batch {i+1} transaction successful: {tx_hash}") - + # If we get an error, log the error and raise an exception except Exception as e: logger.error(f"Error processing batch {i+1} due to: {e}") @@ -647,4 +655,4 @@ def batch_allow_indexers_issuance_eligibility( except KeyValidationError as e: logger.error(f"Private key validation failed: {e}") - raise ValueError(f"Invalid private key: {e}") from e + raise ValueError(f"Invalid private key: {e}") from e diff --git a/src/models/data_processor.py b/src/models/data_processor.py index c561639..47c5b13 100644 --- a/src/models/data_processor.py +++ b/src/models/data_processor.py @@ -75,7 +75,7 @@ def export_bigquery_data_as_csvs_and_return_indexer_lists( def clean_old_date_directories(self, max_age_before_deletion: int) -> None: """ Remove old date directories to prevent unlimited growth. - + Args: max_age_before_deletion: Maximum age in days before deleting data output """ @@ -97,7 +97,7 @@ def clean_old_date_directories(self, max_age_before_deletion: int) -> None: # Try to parse the directory name as a date dir_date = datetime.strptime(item.name, "%Y-%m-%d").date() age_days = (today - dir_date).days - + # Remove if older than max_age_before_deletion if age_days > max_age_before_deletion: logger.info(f"Removing old data directory: {item} ({age_days} days old)") @@ -118,10 +118,10 @@ def clean_old_date_directories(self, max_age_before_deletion: int) -> None: def get_date_output_directory(self, current_date: date) -> Path: """ Get the output directory path for a specific date. - + Args: current_date: Date for which to get the output directory - + Returns: Path: Path to the date-specific output directory """ @@ -138,24 +138,24 @@ def ensure_output_directory_exists(self) -> None: def validate_dataframe_structure(self, df: pd.DataFrame, required_columns: List[str]) -> bool: """ Validate that a DataFrame has the required columns. 
- + Args: df: DataFrame to validate required_columns: List of required column names - + Returns: bool: True if all required columns are present - + Raises: ValueError: If required columns are missing """ # Check if any required columns are missing missing_columns = [col for col in required_columns if col not in df.columns] - + # If any required columns are missing, raise an error if missing_columns: raise ValueError(f"DataFrame missing required columns: {missing_columns}") - + # If all required columns are present, return True return True @@ -163,18 +163,18 @@ def validate_dataframe_structure(self, df: pd.DataFrame, required_columns: List[ def get_directory_size_info(self) -> dict: """ Get information about the output directory size and file counts. - + Returns: dict: Information about directory size and contents """ # If the directory doesn't exist, return a dictionary with 0 values if not self.output_dir.exists(): return {"exists": False, "total_size_bytes": 0, "directory_count": 0, "file_count": 0} - + total_size = 0 file_count = 0 directory_count = 0 - + # Get the total size of the directory and the number of files and directories for item in self.output_dir.rglob("*"): if item.is_file(): @@ -182,7 +182,7 @@ def get_directory_size_info(self) -> dict: file_count += 1 elif item.is_dir(): directory_count += 1 - + # Return the information about the directory size and contents return { "exists": True, diff --git a/src/models/issuance_data_access_helper.py b/src/models/issuance_data_access_helper.py index a124056..7e3a451 100644 --- a/src/models/issuance_data_access_helper.py +++ b/src/models/issuance_data_access_helper.py @@ -29,13 +29,13 @@ def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eli ) -> List[str]: """ Main function to fetch and process data from BigQuery. 
- + Args: start_date: Start date for BigQuery data end_date: End date for BigQuery data current_date: Current date for output directory max_age_before_deletion: Maximum age in days before deleting old data - + Returns: List[str]: List of indexers that should be allowed issuance based on BigQuery data """ @@ -43,16 +43,15 @@ def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eli config_manager = ConfigManager() config = config_manager.load_and_validate_config() project_root = config_manager.get_project_root() - + # Initialize bigquery provider bq_provider = BigQueryProvider( - project=str(config["bigquery_project_id"]), - location=str(config["bigquery_location"]) + project=str(config["bigquery_project_id"]), location=str(config["bigquery_location"]) ) # Initialize data processor data_processor = DataProcessor(project_root) - + try: # Fetch eligibility dataframe logger.info(f"Fetching eligibility data between {start_date} and {end_date}") @@ -60,10 +59,10 @@ def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eli start_date, end_date ) logger.info(f"Retrieved issuance eligibility data for {len(indexer_issuance_eligibility_data)} indexers") - + # Get output directory for current date date_dir = data_processor.get_date_output_directory(current_date) - + # Export data and get indexer lists logger.info(f"Attempting to export indexer issuance eligibility lists to: {date_dir}") eligible_indexers, ineligible_indexers = ( @@ -72,21 +71,21 @@ def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eli ) ) logger.info("Exported indexer issuance eligibility lists.") - + # Clean old eligibility lists logger.info("Cleaning old eligibility lists.") data_processor.clean_old_date_directories(max_age_before_deletion) - + # Log final summary logger.info(f"Processing complete. Output available at: {date_dir}") logger.info( - f"No. of eligible indexers to insert into smart contract on {date.today()} is: {len(eligible_indexers)}" + f"No. 
of eligible indexers to insert into smart contract on " + f"{date.today()} is: {len(eligible_indexers)}" ) - + # Return list of indexers that should be allowed issuance return eligible_indexers - + except Exception as e: logger.error(f"Transaction failed on all RPC providers: {str(e)}") raise - \ No newline at end of file diff --git a/src/models/scheduler.py b/src/models/scheduler.py index 54cdff5..37ee363 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -20,6 +20,7 @@ handlers=[logging.StreamHandler(sys.stdout)], ) logger = logging.getLogger("oracle-scheduler") + # Path to store last run info LAST_RUN_FILE = "/app/data/last_run.txt" HEALTHCHECK_FILE = "/app/healthcheck" @@ -81,7 +82,7 @@ def run_oracle(force_date=None): today = force_date or datetime.now().date() start_time = datetime.now() logger.info(f"Starting Service Quality Oracle run at {start_time} for date {today}") - + # Ensure we have valid google credentials before proceeding credential_manager.setup_google_credentials() @@ -209,24 +210,32 @@ def initialize(): # Set timezone for consistent scheduling timezone = pytz.timezone("UTC") logger.info(f"Using timezone: {timezone}") + # Schedule the job run_time = config["scheduled_run_time"] logger.info(f"Scheduling daily run at {run_time} UTC") schedule.every().day.at(run_time).do(run_oracle) + # Create initial healthcheck file update_healthcheck("Scheduler initialized") + # Run on startup if requested if os.environ.get("RUN_ON_STARTUP", "false").lower() == "true": logger.info("RUN_ON_STARTUP=true, executing oracle immediately") run_oracle() + else: # Check for missed runs logger.info("Checking for missed runs...") + if check_missed_runs(): logger.info("Executed missed run successfully") + else: logger.info("No missed runs to execute") + return config + except Exception as e: logger.error(f"Failed to initialize scheduler: {e}", exc_info=True) diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 48f96eb..550309e 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -59,9 +59,11 @@ def main(): try: # Attempt to load google bigquery data access credentials try: + # fmt: off import google.auth - _ = google.auth.default() + # fmt: on + # If credentials could not be loaded, set them up in memory via helper function using environment variables except Exception: credential_manager.setup_google_credentials() diff --git a/src/models/subgraph_data_access_provider.py b/src/models/subgraph_data_access_provider.py index fe344be..653d41c 100644 --- a/src/models/subgraph_data_access_provider.py +++ b/src/models/subgraph_data_access_provider.py @@ -35,18 +35,27 @@ def __init__(self): # Load configuration config = load_config() + # Get subgraph URL and API key from config self.subgraph_url = config.get("subgraph_url") self.api_key = config.get("studio_api_key") - # Validate configuration + + # If the subgraph URL is not set, raise an error if not self.subgraph_url: raise ValueError("SUBGRAPH_URL_PRODUCTION not set in configuration") + + # Log the initialized subgraph provider logger.info(f"Initialized SubgraphProvider with endpoint: {self.subgraph_url}") + + # If the API key is set, log a message if self.api_key: logger.info("API key loaded for subgraph queries") + + # If the API key is not set, log a warning else: logger.warning("No API key found, subgraph queries may be limited") + def fetch_all_indexers(self) -> list[dict[str, Any]]: """ Fetch all indexers that have been input into the subgraph. 
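For reference, a minimal sketch of the paginated fetch pattern that fetch_all_indexers() is assumed to build on, using the get_indexer_eligibility_statuses(first, skip) signature shown below; the helper name and page-size handling here are illustrative only.

from typing import Any


def fetch_all_pages(provider: Any, page_size: int = 1000) -> list[dict[str, Any]]:
    # Request pages of `page_size` items, advancing `skip` each time,
    # until a short page signals that no more results remain.
    all_items: list[dict[str, Any]] = []
    skip = 0
    while True:
        page = provider.get_indexer_eligibility_statuses(first=page_size, skip=skip)
        all_items.extend(page)
        if len(page) < page_size:
            break
        skip += page_size
    return all_items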
@@ -68,6 +77,7 @@ def fetch_all_indexers(self) -> list[dict[str, Any]]: logger.info(f"Fetched {len(all_indexers)} total indexers from subgraph") return all_indexers + def get_indexer_eligibility_statuses(self, first: int = 1000, skip: int = 0) -> list[dict[str, Any]]: """ Get eligibility statuses for all indexers. @@ -96,6 +106,7 @@ def get_indexer_eligibility_statuses(self, first: int = 1000, skip: int = 0) -> logger.error(f"Unexpected response format: {result}") return [] + def execute_query(self, query: str, variables: Optional[dict[str, Any]] = None) -> dict[str, Any]: """ Execute a GraphQL query against the subgraph. diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index 893bc37..bba2cf7 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -59,8 +59,9 @@ def _get_default_config_path(self) -> str: raise ConfigurationError("Could not find config.toml in project root or Docker container") - # TODO: check this... + + def _substitute_env_vars(self, config_toml: Any) -> Any: """ Recursively substitute environment variables in the config. @@ -115,15 +116,15 @@ def load_config(self) -> dict[str, Any]: # Load the TOML configuration with open(self.config_path, "rb") as f: config = tomllib.load(f) - + logger.info(f"Loaded configuration from: {self.config_path}") - + # Substitute environment variables throughout the configuration config = self._substitute_env_vars(config) - + logger.info("Successfully loaded configuration with environment variables") return config - + except FileNotFoundError as e: raise ConfigurationError(f"Configuration not found: {self.config_path}") from e except ConfigurationError: @@ -178,9 +179,11 @@ def _collect_missing_env_vars(self, obj: Any) -> list[str]: for var in env_vars: if os.getenv(var) is None: missing_vars.append(var) + elif isinstance(obj, dict): for value in obj.values(): missing_vars.extend(self._collect_missing_env_vars(value)) + elif isinstance(obj, list): for item in obj: missing_vars.extend(self._collect_missing_env_vars(item)) @@ -198,35 +201,37 @@ def get_flat_config(self) -> dict[str, Any]: """ config = self.load_config() + # fmt: off # Convert nested structure to flat format flat_config = { # BigQuery settings "bigquery_location": config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), "bigquery_project_id": config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), "bigquery_dataset_id": config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), - + # Blockchain settings "contract_address": config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), "contract_function": config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), "chain_id": config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID"), "rpc_providers": self._parse_rpc_urls(config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), - + # Scheduling "scheduled_run_time": config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), - + # Subgraph URLs "subgraph_url": config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), - + # Processing settings "batch_size": config.get("processing", {}).get("BATCH_SIZE", 125), "max_age_before_deletion": config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120), - + # Secrets "google_application_credentials": config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), "private_key": config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), "studio_api_key": config.get("secrets", {}).get("STUDIO_API_KEY"), "slack_webhook_url": config.get("secrets", 
{}).get("SLACK_WEBHOOK_URL"), } + # fmt: on return flat_config @@ -246,7 +251,6 @@ def _parse_rpc_urls(self, rpc_urls: list) -> list[str]: return valid_providers -# Convenience function for easy integration with existing code def load_config() -> dict[str, Any]: """ Convenience function to load configuration. @@ -263,7 +267,6 @@ def load_config() -> dict[str, Any]: return loader.get_flat_config() -# For startup validation def validate_all_required_env_vars() -> None: """ Validate that all required environment variables are set. diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py index 828c2e4..8c049e8 100644 --- a/src/utils/config_manager.py +++ b/src/utils/config_manager.py @@ -17,10 +17,10 @@ class ConfigManager: """Centralized configuration manager with validation and credential handling.""" - + def __init__(self): self._config = None - + def _validate_required_fields(self, data: dict, required_fields: list[str], context: str) -> None: """ @@ -67,20 +67,22 @@ def load_and_validate_config(self) -> dict[str, Any]: # If the configuration has already been loaded, return it if self._config is not None: return self._config - + try: # Load configuration using config loader loader = ConfigLoader() config = loader.get_flat_config() logger.info("Successfully loaded configuration") - + # Validate and convert chain_id to integer if config.get("chain_id"): try: config["chain_id"] = int(config["chain_id"]) except ValueError as e: - raise ValueError(f"Invalid BLOCKCHAIN_CHAIN_ID: {config['chain_id']} - must be an integer.") from e - + raise ValueError( + f"Invalid BLOCKCHAIN_CHAIN_ID: {config['chain_id']} - must be an integer." + ) from e + # Validate scheduled run time format (HH:MM) if config.get("scheduled_run_time"): try: @@ -90,7 +92,7 @@ def load_and_validate_config(self) -> dict[str, Any]: f"Invalid SCHEDULED_RUN_TIME format: {config['scheduled_run_time']} - " "must be in HH:MM format" ) from e - + # Validate blockchain configuration contains all required fields required_fields = [ "private_key", @@ -100,11 +102,11 @@ def load_and_validate_config(self) -> dict[str, Any]: "scheduled_run_time", ] self._validate_required_fields(config, required_fields, "Missing required blockchain configuration") - + # Validate RPC providers if not config.get("rpc_providers") or not isinstance(config["rpc_providers"], list): raise ValueError("BLOCKCHAIN_RPC_URLS must be a list of valid RPC URLs") - + # Set the configuration in the class & return it self._config = config return config @@ -125,7 +127,7 @@ def get_project_root() -> Path: docker_path = Path("/app") if docker_path.exists(): return docker_path - + # If the /app directory doesn't exist fall back to marker files current_path = Path(__file__).parent while current_path != current_path.parent: @@ -134,14 +136,14 @@ def get_project_root() -> Path: return current_path # Attempt to traverse upwards (will not work if the directory has no parent) current_path = current_path.parent - + # If we got here, something is wrong raise FileNotFoundError("Could not find project root directory. 
Investigate.") class CredentialManager: """Handles credential management for Google Cloud services.""" - + def __init__(self): pass @@ -184,20 +186,20 @@ def _parse_and_validate_credentials_json(self, creds_env: str) -> dict: # Parse the credentials creds_data = json.loads(creds_env) cred_type = creds_data.get("type", "") - + # Validate the credentials data based on the type if cred_type == "authorized_user": required_fields = ["client_id", "client_secret", "refresh_token"] self._validate_required_fields( creds_data, required_fields, "Incomplete authorized_user credentials" ) - + elif cred_type == "service_account": required_fields = ["private_key", "client_email", "project_id"] self._validate_required_fields( creds_data, required_fields, "Incomplete service_account credentials" ) - + else: raise ValueError( f"Unsupported credential type: '{cred_type}'. Expected 'authorized_user' or 'service_account'" @@ -207,7 +209,7 @@ def _parse_and_validate_credentials_json(self, creds_env: str) -> dict: except Exception as e: logger.error(f"Failed to parse and validate credentials JSON: {e}") raise ValueError(f"Invalid credentials JSON: {e}") from e - + # Return the parsed credentials return creds_data @@ -230,7 +232,7 @@ def _setup_user_credentials_in_memory(self, creds_data: dict) -> None: # Set credentials globally for GCP libraries google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] logger.info("Successfully loaded user account credentials from environment variable") - + # Clear credentials from memory finally: if "creds_data" in locals(): @@ -245,19 +247,17 @@ def _setup_service_account_credentials_in_memory(self, creds_data: dict) -> None # Try to set up the credentials try: # Create credentials object directly from dict - credentials = service_account.Credentials.from_service_account_info( - creds_data - ) + credentials = service_account.Credentials.from_service_account_info(creds_data) # Set credentials globally for GCP libraries google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] logger.info("Successfully loaded service account credentials from environment variable") - + # If the credentials creation fails, raise an error except Exception as e: logger.error(f"Failed to create service account credentials: {e}") raise ValueError(f"Invalid service account credentials: {e}") from e - + # Clear the original credentials dict from memory if it exists finally: if "creds_data" in locals(): @@ -275,14 +275,14 @@ def setup_google_credentials(self) -> None: """ # Get the account credentials from the environment variable creds_env = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") - + # If the credentials are not set, log a warning and return if not creds_env: logger.warning( "GOOGLE_APPLICATION_CREDENTIALS not set. 
Falling back to gcloud CLI user credentials if available" ) return - + # Case 1: JSON credentials provided inline if creds_env.strip().startswith("{"): creds_data = None @@ -290,28 +290,29 @@ def setup_google_credentials(self) -> None: # Parse and validate credentials creds_data = self._parse_and_validate_credentials_json(creds_env) cred_type = creds_data.get("type") - + # Set up credentials based on type if cred_type == "authorized_user": self._setup_user_credentials_in_memory(creds_data.copy()) elif cred_type == "service_account": self._setup_service_account_credentials_in_memory(creds_data.copy()) - + except Exception as e: logger.error("Failed to set up credentials from environment variable") raise ValueError(f"Error processing inline credentials: {e}") from e finally: if creds_data is not None: creds_data.clear() - + # Case 2: File path provided elif os.path.exists(creds_env): logger.info(f"Using credentials file: {creds_env}") - + # Case 3: Invalid format else: logger.warning( - f"GOOGLE_APPLICATION_CREDENTIALS appears to be neither valid JSON nor existing file path: {creds_env[:50]}..." + f"GOOGLE_APPLICATION_CREDENTIALS appears to be neither valid JSON " + f"nor existing file path: {creds_env[:50]}..." ) logger.warning("Falling back to gcloud CLI authentication if available") @@ -319,25 +320,26 @@ def setup_google_credentials(self) -> None: def validate_google_credentials(self) -> bool: """ Validate that Google credentials are properly configured and working. - + Returns: bool: True if credentials are valid and working """ # Try to validate the credentials try: import google.auth + credentials, project = google.auth.default() - + # If the credentials are valid, log the success and return True if credentials: logger.info(f"Google credentials validated successfully for project: {project}") return True - + # If the credentials are not valid, log the error and return False else: logger.error("No valid Google credentials found") return False - + # If the credentials could not be validated log the error except Exception as e: logger.error(f"Google credentials validation failed: {e}") @@ -346,4 +348,4 @@ def validate_google_credentials(self) -> bool: # Global instances for easy access config_manager = ConfigManager() -credential_manager = CredentialManager() \ No newline at end of file +credential_manager = CredentialManager() diff --git a/src/utils/retry_decorator.py b/src/utils/retry_decorator.py index b5a4887..a3eff8f 100644 --- a/src/utils/retry_decorator.py +++ b/src/utils/retry_decorator.py @@ -7,16 +7,17 @@ from typing import Any, Callable, Type, Union from tenacity import ( + before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_exponential, - before_sleep_log, ) logger = logging.getLogger(__name__) +# fmt: off def retry_with_backoff( max_attempts: int = 5, min_wait: int = 1, @@ -27,7 +28,7 @@ def retry_with_backoff( ) -> Callable: """ Retry decorator with exponential backoff. 
- + Args: max_attempts: Maximum number of retry attempts (default: 5) min_wait: Minimum wait time between retries in seconds (default: 1) @@ -35,7 +36,7 @@ def retry_with_backoff( multiplier: Exponential backoff multiplier (default: 2) exceptions: Exception types to retry on (default: Exception) reraise: Whether to reraise the exception after all attempts fail (default: True) - + Returns: Decorated function with retry logic """ @@ -51,7 +52,9 @@ def decorator(func: Callable) -> Callable: @wraps(func) def wrapper(*args: Any, **kwargs: Any) -> Any: return func(*args, **kwargs) - + return wrapper - + + return decorator +# fmt: on diff --git a/src/utils/slack_notifier.py b/src/utils/slack_notifier.py index a4df270..445a601 100644 --- a/src/utils/slack_notifier.py +++ b/src/utils/slack_notifier.py @@ -30,10 +30,7 @@ def __init__(self, webhook_url: str) -> None: @retry_with_backoff( - max_attempts=8, - min_wait=1, - max_wait=128, - exceptions=(requests.exceptions.RequestException,) + max_attempts=8, min_wait=1, max_wait=128, exceptions=(requests.exceptions.RequestException,) ) def _send_message(self, payload: Dict) -> bool: """ @@ -76,7 +73,6 @@ def _send_message(self, payload: Dict) -> bool: raise - def _create_payload(self, text: str, fields: List[Dict], color: str = "good") -> Dict: """Create a Slack message payload.""" return { From 78c0b20351e350e0f44342496e0574489b66cd21 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 11:57:19 +0200 Subject: [PATCH 24/48] audit cp --- src/models/blockchain_client.py | 4 +- src/models/issuance_data_access_helper.py | 2 +- src/models/scheduler.py | 68 ++++--------- src/models/service_quality_oracle.py | 115 +++++++++------------- src/utils/config_manager.py | 1 - src/utils/slack_notifier.py | 18 +--- 6 files changed, 69 insertions(+), 139 deletions(-) diff --git a/src/models/blockchain_client.py b/src/models/blockchain_client.py index 0e5b2fb..76468a2 100644 --- a/src/models/blockchain_client.py +++ b/src/models/blockchain_client.py @@ -585,8 +585,8 @@ def batch_allow_indexers_issuance_eligibility( private_key: str, chain_id: int, contract_function: str, + batch_size: int, replace: bool = False, - batch_size: int = 125, data_bytes: bytes = b"", ) -> List[str]: """ @@ -597,8 +597,8 @@ def batch_allow_indexers_issuance_eligibility( private_key: Private key for transaction signing chain_id: Chain ID of the target blockchain contract_function: Contract function name to call - replace: Optional flag to replace pending transactions batch_size: Optional batch size for processing large lists + replace: Optional flag to replace pending transactions data_bytes: Optional bytes data to pass to contract function Returns: diff --git a/src/models/issuance_data_access_helper.py b/src/models/issuance_data_access_helper.py index 7e3a451..0f9a70d 100644 --- a/src/models/issuance_data_access_helper.py +++ b/src/models/issuance_data_access_helper.py @@ -87,5 +87,5 @@ def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eli return eligible_indexers except Exception as e: - logger.error(f"Transaction failed on all RPC providers: {str(e)}") + logger.error(f"Failed to fetch and process BigQuery data: {str(e)}") raise diff --git a/src/models/scheduler.py b/src/models/scheduler.py index 37ee363..6c2a5d6 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -78,61 +78,22 @@ def run_oracle(force_date=None): Args: force_date: If provided, override the date for this run """ - global slack_notifier today = force_date or 
datetime.now().date() start_time = datetime.now() logger.info(f"Starting Service Quality Oracle run at {start_time} for date {today}") - # Ensure we have valid google credentials before proceeding - credential_manager.setup_google_credentials() - - # Attempt to run the oracle - try: - # Load latest configuration using config loader - load_config() - - # Run the oracle - oracle.main() - - # Record successful run and overwrite the last run date - save_last_run_date(today) - end_time = datetime.now() - duration_in_seconds = (end_time - start_time).total_seconds() - success_message = f"Run completed successfully for {today}. Duration: {duration_in_seconds:.2f}s" - logger.info(f"Service Quality Oracle {success_message}") - - # Touch healthcheck file to indicate successful runs - update_healthcheck(success_message) - - # Send success notification from scheduler - if slack_notifier: - slack_notifier.send_success_notification( - message=f"Run completed successfully for {today}. Duration: {duration_in_seconds:.2f}s", - title="Scheduled Run Success", - ) - - # Return True to indicate success - return True - - # If there is an error when trying to run the oracle, log the error and raise an exception - except Exception as e: - error_message = f"Run failed due to: {str(e)}" - logger.error(error_message, exc_info=True) - - # Update healthcheck file to indicate failure - update_healthcheck(f"ERROR: {error_message}") - - # Send failure notification to slack - if slack_notifier: - duration = (datetime.now() - start_time).total_seconds() - slack_notifier.send_failure_notification( - error_message=str(e), - stage="Scheduled Run" if force_date is None else f"Missed Run ({force_date})", - execution_time=duration, - ) + # The oracle.main() function handles its own exceptions, notifications, and credential setup. + # The scheduler's role is simply to trigger it and handle the retry logic. + oracle.main() - # Raise an exception to indicate failure - raise + # If oracle.main() completes without sys.exit, it was successful. + # Record successful run and update healthcheck. + save_last_run_date(today) + end_time = datetime.now() + duration_in_seconds = (end_time - start_time).total_seconds() + success_message = f"Scheduler successfully triggered oracle run for {today}. Duration: {duration_in_seconds:.2f}s" + logger.info(success_message) + update_healthcheck(success_message) def check_missed_runs(): @@ -168,6 +129,13 @@ def check_missed_runs(): return True except Exception as e: logger.error(f"Failed to execute missed run for {yesterday}: {e}") + # The oracle.main() has already sent a detailed failure notification. + # We can send an additional, scheduler-specific notification if desired. + if slack_notifier: + slack_notifier.send_failure_notification( + error_message=f"The missed run for {yesterday} failed. 
See previous error for details.", + stage="Scheduler Missed Run Execution" + ) return False return False diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 550309e..a4a86d1 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -42,6 +42,7 @@ def main(): """ start_time = time.time() slack_notifier = None + stage = "Initialization" try: # Load configuration to get Slack webhook and other settings @@ -52,91 +53,65 @@ def main(): else: logger.info("Slack notifications disabled (no webhook URL configured)") - except Exception as e: - logger.error(f"Failed to load configuration: {str(e)}") - sys.exit(1) - - try: # Attempt to load google bigquery data access credentials + stage = "Authentication" try: # fmt: off import google.auth _ = google.auth.default() # fmt: on - - # If credentials could not be loaded, set them up in memory via helper function using environment variables except Exception: credential_manager.setup_google_credentials() - try: - # Fetch + save indexer eligibility data and return eligible list as 'eligible_indexers' array - eligible_indexers = ( - bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers( - start_date=date.today() - timedelta(days=28), - end_date=date.today(), - current_date=date.today(), - max_age_before_deletion=config.get("MAX_AGE_BEFORE_DELETION"), - ) - ) - - logger.info(f"Found {len(eligible_indexers)} eligible indexers.") - - # Send eligible indexers to the blockchain contract - try: - transaction_links = batch_allow_indexers_issuance_eligibility_smart_contract( - eligible_indexers, replace=True, batch_size=config.get("BATCH_SIZE"), data_bytes=b"" - ) - - # Calculate execution time and send success notification - execution_time = time.time() - start_time - logger.info(f"Oracle run completed successfully in {execution_time:.2f} seconds") - - if slack_notifier: - # Calculate batch information for notification - batch_count = len(transaction_links) if transaction_links else 0 - total_processed = len(eligible_indexers) - - slack_notifier.send_success_notification( - eligible_indexers=eligible_indexers, - total_processed=total_processed, - execution_time=execution_time, - transaction_links=transaction_links, - batch_count=batch_count, - ) - - except Exception as e: - execution_time = time.time() - start_time - error_msg = f"Failed to allow indexers to claim issuance because: {str(e)}" - logger.error(error_msg) - - if slack_notifier: - slack_notifier.send_failure_notification( - error_message=str(e), stage="Blockchain Submission", execution_time=execution_time - ) - - sys.exit(1) - - except Exception as e: - execution_time = time.time() - start_time - error_msg = f"Failed to process indexer issuance eligibility data because: {str(e)}" - logger.error(error_msg) - - if slack_notifier: - slack_notifier.send_failure_notification( - error_message=str(e), stage="Data Processing", execution_time=execution_time - ) + # Fetch + save indexer eligibility data and return eligible list + stage = "Data Processing" + eligible_indexers = bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers( + start_date=date.today() - timedelta(days=28), + end_date=date.today(), + current_date=date.today(), + max_age_before_deletion=config.get("MAX_AGE_BEFORE_DELETION"), + ) + logger.info(f"Found {len(eligible_indexers)} eligible indexers.") + + # Send eligible indexers to the blockchain contract + stage = "Blockchain Submission" + 
transaction_links = batch_allow_indexers_issuance_eligibility_smart_contract( + eligible_indexers, + private_key=config["private_key"], + chain_id=config["chain_id"], + contract_function=config["contract_function"], + batch_size=config.get("BATCH_SIZE"), + replace=True, + data_bytes=b"", + ) + + # Calculate execution time and send success notification + execution_time = time.time() - start_time + logger.info(f"Oracle run completed successfully in {execution_time:.2f} seconds") - sys.exit(1) + if slack_notifier: + batch_count = len(transaction_links) if transaction_links else 0 + total_processed = len(eligible_indexers) + slack_notifier.send_success_notification( + eligible_indexers=eligible_indexers, + total_processed=total_processed, + execution_time=execution_time, + transaction_links=transaction_links, + batch_count=batch_count, + ) except Exception as e: execution_time = time.time() - start_time - error_msg = f"Oracle initialization or authentication failed: {str(e)}" - logger.error(error_msg) + error_msg = f"Oracle failed at stage '{stage}': {str(e)}" + logger.error(error_msg, exc_info=True) if slack_notifier: - slack_notifier.send_failure_notification( - error_message=str(e), stage="Initialization", execution_time=execution_time - ) + try: + slack_notifier.send_failure_notification( + error_message=str(e), stage=stage, execution_time=execution_time + ) + except Exception as slack_e: + logger.error(f"Failed to send Slack failure notification: {slack_e}", exc_info=True) sys.exit(1) diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py index 8c049e8..9622660 100644 --- a/src/utils/config_manager.py +++ b/src/utils/config_manager.py @@ -264,7 +264,6 @@ def _setup_service_account_credentials_in_memory(self, creds_data: dict) -> None creds_data.clear() - @retry_with_backoff(max_attempts=3, exceptions=(ValueError,)) def setup_google_credentials(self) -> None: """ Set up Google credentials directly in memory from environment variable. 
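For reference, a minimal sketch of the stage-tracking error-handling pattern that the refactored main() above introduces, with placeholder step bodies and a hypothetical notifier argument: one try block, a stage label updated before each step, and a single failure path that reports which stage failed.

import logging

logger = logging.getLogger(__name__)


def run_pipeline(notifier=None) -> None:
    stage = "Initialization"
    try:
        stage = "Data Processing"
        eligible = ["0xabc"]  # placeholder for the fetched eligibility list

        stage = "Blockchain Submission"
        _ = eligible  # placeholder for the batched on-chain submission
    except Exception as e:
        # A single except block covers every stage; the label identifies
        # where the failure happened in logs and notifications.
        logger.error(f"Failed at stage '{stage}': {e}", exc_info=True)
        if notifier:
            notifier.send_failure_notification(error_message=str(e), stage=stage)
        raise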
diff --git a/src/utils/slack_notifier.py b/src/utils/slack_notifier.py index 445a601..ff0ef9a 100644 --- a/src/utils/slack_notifier.py +++ b/src/utils/slack_notifier.py @@ -136,11 +136,7 @@ def send_success_notification( payload = self._create_payload("Service Quality Oracle - Success", fields, "good") # Send message payload to Slack - try: - return self._send_message(payload) - except Exception as e: - logger.error(f"Failed to send success notification: {e}") - return False + return self._send_message(payload) def send_failure_notification( @@ -194,11 +190,7 @@ def send_failure_notification( payload = self._create_payload("Service Quality Oracle - FAILURE", fields, "danger") # Send message payload to Slack - try: - return self._send_message(payload) - except Exception as e: - logger.error(f"Failed to send failure notification: {e}") - return False + return self._send_message(payload) def send_info_notification(self, message: str, title: str = "Info") -> bool: @@ -224,11 +216,7 @@ def send_info_notification(self, message: str, title: str = "Info") -> bool: payload = self._create_payload(f"Service Quality Oracle - {title}", fields) # Send message payload to Slack - try: - return self._send_message(payload) - except Exception as e: - logger.error(f"Failed to send info notification: {e}") - return False + return self._send_message(payload) def create_slack_notifier(webhook_url: Optional[str]) -> Optional[SlackNotifier]: From aa310b8c37d37cca775bd14973f298c41b1b4b13 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 12:02:25 +0200 Subject: [PATCH 25/48] audit cp2 --- src/models/issuance_data_access_helper.py | 91 ----------------------- src/models/service_quality_oracle.py | 16 ++-- 2 files changed, 7 insertions(+), 100 deletions(-) delete mode 100644 src/models/issuance_data_access_helper.py diff --git a/src/models/issuance_data_access_helper.py b/src/models/issuance_data_access_helper.py deleted file mode 100644 index 0f9a70d..0000000 --- a/src/models/issuance_data_access_helper.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -Helper module containing high-level functions for the Service Quality Oracle. - -This module focuses on: -- Main data processing -- BigQuery data fetching and processing -- Integration between different components -""" - -import logging -from datetime import date -from typing import List - -from tenacity import retry, stop_after_attempt, wait_exponential - -from src.models.bigquery_data_access_provider import BigQueryProvider -from src.models.data_processor import DataProcessor -from src.utils.config_manager import ConfigManager - -logger = logging.getLogger(__name__) - - -@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=30, max=120), reraise=True) -def bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers( - start_date: date, - end_date: date, - current_date: date, - max_age_before_deletion: int, -) -> List[str]: - """ - Main function to fetch and process data from BigQuery. 
- - Args: - start_date: Start date for BigQuery data - end_date: End date for BigQuery data - current_date: Current date for output directory - max_age_before_deletion: Maximum age in days before deleting old data - - Returns: - List[str]: List of indexers that should be allowed issuance based on BigQuery data - """ - # Load configuration - config_manager = ConfigManager() - config = config_manager.load_and_validate_config() - project_root = config_manager.get_project_root() - - # Initialize bigquery provider - bq_provider = BigQueryProvider( - project=str(config["bigquery_project_id"]), location=str(config["bigquery_location"]) - ) - - # Initialize data processor - data_processor = DataProcessor(project_root) - - try: - # Fetch eligibility dataframe - logger.info(f"Fetching eligibility data between {start_date} and {end_date}") - indexer_issuance_eligibility_data = bq_provider.fetch_indexer_issuance_eligibility_data( - start_date, end_date - ) - logger.info(f"Retrieved issuance eligibility data for {len(indexer_issuance_eligibility_data)} indexers") - - # Get output directory for current date - date_dir = data_processor.get_date_output_directory(current_date) - - # Export data and get indexer lists - logger.info(f"Attempting to export indexer issuance eligibility lists to: {date_dir}") - eligible_indexers, ineligible_indexers = ( - data_processor.export_bigquery_data_as_csvs_and_return_indexer_lists( - indexer_issuance_eligibility_data, date_dir - ) - ) - logger.info("Exported indexer issuance eligibility lists.") - - # Clean old eligibility lists - logger.info("Cleaning old eligibility lists.") - data_processor.clean_old_date_directories(max_age_before_deletion) - - # Log final summary - logger.info(f"Processing complete. Output available at: {date_dir}") - logger.info( - f"No. 
of eligible indexers to insert into smart contract on " - f"{date.today()} is: {len(eligible_indexers)}" - ) - - # Return list of indexers that should be allowed issuance - return eligible_indexers - - except Exception as e: - logger.error(f"Failed to fetch and process BigQuery data: {str(e)}") - raise diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index a4a86d1..52c446b 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -18,10 +18,8 @@ sys.path.insert(0, project_root) # Import data access utilities with absolute import -from src.models.issuance_data_access_helper import ( - batch_allow_indexers_issuance_eligibility_smart_contract, - bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers, -) +from src.models.blockchain_client import BlockchainClient +from src.models.data_processor import DataProcessor from src.utils.config_loader import load_config from src.utils.config_manager import credential_manager from src.utils.slack_notifier import create_slack_notifier @@ -65,24 +63,24 @@ def main(): # Fetch + save indexer eligibility data and return eligible list stage = "Data Processing" - eligible_indexers = bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers( + data_processor = DataProcessor(config) + eligible_indexers = data_processor.process_and_get_eligible_indexers( start_date=date.today() - timedelta(days=28), end_date=date.today(), current_date=date.today(), - max_age_before_deletion=config.get("MAX_AGE_BEFORE_DELETION"), ) logger.info(f"Found {len(eligible_indexers)} eligible indexers.") # Send eligible indexers to the blockchain contract stage = "Blockchain Submission" - transaction_links = batch_allow_indexers_issuance_eligibility_smart_contract( - eligible_indexers, + blockchain_client = BlockchainClient() + transaction_links = blockchain_client.batch_allow_indexers_issuance_eligibility( + indexer_addresses=eligible_indexers, private_key=config["private_key"], chain_id=config["chain_id"], contract_function=config["contract_function"], batch_size=config.get("BATCH_SIZE"), replace=True, - data_bytes=b"", ) # Calculate execution time and send success notification From e057fdcf7e9c19c7c7846db0d7a6a23ffc721208 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 12:19:27 +0200 Subject: [PATCH 26/48] audit cp3 --- Dockerfile | 26 ++++++++++++++++---------- docker-compose.yml | 12 ++++++++++-- src/models/scheduler.py | 8 ++------ 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/Dockerfile b/Dockerfile index ff45f4a..669d65a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -# Dockerfile for Service Quality Oracle +# Dockerfile to create a clean, lightweight Docker Image for the Service Quality Oracle # Use Python 3.9 slim as the base image for a lightweight container FROM python:3.9-slim @@ -10,7 +10,14 @@ LABEL description="Service Quality Oracle" \ # Set working directory WORKDIR /app -# Set environment variables + +""" +Setup enviroment variables: + 1. PYTHONDONTWRITEBYTECODE=1 - Prevent python from creating .pyc files + 2. PYTHONUNBUFFERED=1 - Send logs direct to console without buffering + 3. PYTHONPATH=/app - Add app directory to python import path + 4. 
TZ=UTC - Set timezone to UTC +""" ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ PYTHONPATH=/app \ @@ -40,18 +47,17 @@ COPY contracts/ ./contracts/ COPY .gitignore ./ COPY pyproject.toml ./ -# Copy the scheduler to the root directory -COPY src/models/scheduler.py ./ - # Create healthcheck file RUN touch /app/healthcheck # Use Tini as entrypoint for proper signal handling ENTRYPOINT ["/usr/bin/tini", "--"] -# Add healthcheck to verify the service is running -HEALTHCHECK --interval=5m --timeout=30s --start-period=1m --retries=3 \ - CMD python -c "import os, time; assert os.path.exists('/app/healthcheck') and time.time() - os.path.getmtime('/app/healthcheck') < 3600, 'Healthcheck failed'" || exit 1 +# Add healthcheck to verify the service is running. +# The scheduler updates the healthcheck file every minute. +# We check every 2 minutes and assert the file was modified in the last 5 minutes (300s). +HEALTHCHECK --interval=2m --timeout=30s --start-period=1m --retries=3 \ + CMD python -c "import os, time; assert os.path.exists('/app/healthcheck') and time.time() - os.path.getmtime('/app/healthcheck') < 300, 'Healthcheck failed'" || exit 1 -# Run the scheduler -CMD ["python", "scheduler.py"] +# Run the scheduler as a module +CMD ["python", "-m", "src.models.scheduler"] diff --git a/docker-compose.yml b/docker-compose.yml index aa79723..b3e0605 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,14 @@ services: + # Service Quality Oracle container service-quality-oracle: + + # Build the image from the Dockerfile in the current directory build: . + + # Set the container name container_name: service-quality-oracle + + # Set the image name image: service-quality-oracle:latest volumes: @@ -16,9 +23,7 @@ services: - ./credentials.json:/app/credentials.json:ro environment: - - PYTHONPATH=/app - RUN_ON_STARTUP=true - - TZ=UTC # Setup enviroment variables # Environment variables go into process memory for this specific container only @@ -45,8 +50,10 @@ services: reservations: memory: 512M + # Restart policy restart: unless-stopped + # Healthcheck to ensure the container is running healthcheck: test: ["CMD", "python", "-c", "import os, time; assert os.path.exists('/app/healthcheck') and time.time() - os.path.getmtime('/app/healthcheck') < 3600, 'Healthcheck failed'"] interval: 5m @@ -54,6 +61,7 @@ services: retries: 3 start_period: 1m + # Prevent log files from growing indefinitely and consuming disk space logging: driver: "json-file" options: diff --git a/src/models/scheduler.py b/src/models/scheduler.py index 6c2a5d6..a686416 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -225,12 +225,8 @@ def initialize(): try: while True: schedule.run_pending() - # Update healthcheck file periodically (every 30 seconds) - if datetime.now().second % 30 == 0: - update_healthcheck("Scheduler heartbeat") - - # Sleep - time.sleep(15) + update_healthcheck("Scheduler heartbeat") + time.sleep(60) except KeyboardInterrupt: logger.info("Scheduler stopped by user") From bcc2507d5d54802dd639871b89c571b9677f1ff9 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 13:39:53 +0200 Subject: [PATCH 27/48] audit cp4 --- src/models/scheduler.py | 368 ++++++++++++--------------- src/models/service_quality_oracle.py | 30 ++- src/utils/config_loader.py | 30 +-- 3 files changed, 196 insertions(+), 232 deletions(-) diff --git a/src/models/scheduler.py b/src/models/scheduler.py index a686416..c20d5ef 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -25,225 +25,179 
@@ LAST_RUN_FILE = "/app/data/last_run.txt" HEALTHCHECK_FILE = "/app/healthcheck" -# Create a global slack notifier instance -slack_notifier = None - -def get_last_run_date(): - """Get the date of the last successful run from a persistent file""" - if os.path.exists(LAST_RUN_FILE): +class Scheduler: + def __init__(self): + self.slack_notifier = None + self.config = self.initialize() + + def get_last_run_date(self): + """Get the date of the last successful run from a persistent file""" + if os.path.exists(LAST_RUN_FILE): + try: + with open(LAST_RUN_FILE) as f: + last_run_str = f.read().strip() + return datetime.strptime(last_run_str, "%Y-%m-%d").date() + except Exception as e: + logger.error(f"Error reading last run date: {e}") + return None + + def save_last_run_date(self, run_date): + """Save the date of the last successful run to a file that we continuously overwrite each time""" try: - with open(LAST_RUN_FILE) as f: - last_run_str = f.read().strip() - return datetime.strptime(last_run_str, "%Y-%m-%d").date() + os.makedirs(os.path.dirname(LAST_RUN_FILE), exist_ok=True) + with open(LAST_RUN_FILE, "w") as f: + f.write(run_date.strftime("%Y-%m-%d")) except Exception as e: - logger.error(f"Error reading last run date: {e}") - return None - - -def save_last_run_date(run_date): - """Save the date of the last successful run to a file that we continuously overwrite each time""" - try: - os.makedirs(os.path.dirname(LAST_RUN_FILE), exist_ok=True) - with open(LAST_RUN_FILE, "w") as f: - f.write(run_date.strftime("%Y-%m-%d")) - except Exception as e: - logger.error(f"Error saving last run date: {e}") - - -def update_healthcheck(message=None): - """Update the healthcheck file with current timestamp and optional message""" - try: - with open(HEALTHCHECK_FILE, "w") as f: - timestamp = datetime.now().isoformat() - f.write(f"Last update: {timestamp}") - if message: - f.write(f"\n{message}") - except Exception as e: - logger.warning(f"Failed to update healthcheck file: {e}") - - -@retry( - stop=stop_after_attempt(5), - wait=wait_exponential(multiplier=1, min=60, max=600), - retry=retry_if_exception_type(Exception), - before_sleep=lambda retry_state: logger.warning( - f"Retry attempt {retry_state.attempt_number} after error: {retry_state.outcome.exception()}" - ), -) -def run_oracle(force_date=None): - """ - Function to run the Service Quality Oracle - - Args: - force_date: If provided, override the date for this run - """ - today = force_date or datetime.now().date() - start_time = datetime.now() - logger.info(f"Starting Service Quality Oracle run at {start_time} for date {today}") - - # The oracle.main() function handles its own exceptions, notifications, and credential setup. - # The scheduler's role is simply to trigger it and handle the retry logic. - oracle.main() - - # If oracle.main() completes without sys.exit, it was successful. - # Record successful run and update healthcheck. - save_last_run_date(today) - end_time = datetime.now() - duration_in_seconds = (end_time - start_time).total_seconds() - success_message = f"Scheduler successfully triggered oracle run for {today}. Duration: {duration_in_seconds:.2f}s" - logger.info(success_message) - update_healthcheck(success_message) - - -def check_missed_runs(): - """Check if we missed any runs and execute them if needed""" - global slack_notifier - today = datetime.now().date() - last_run = get_last_run_date() - if last_run is None: - logger.info("No record of previous runs. 
Will run at next scheduled time.") - return False - if last_run < today - timedelta(days=1): - # We missed at least one day - missed_days = (today - last_run).days - 1 - logger.warning(f"Detected {missed_days} missed runs. Last run was on {last_run}.") - - # Send notification about missed runs - if slack_notifier: - message = ( - f"Detected {missed_days} missed oracle runs. " - f"Last successful run was on {last_run}. " - "Attempting to execute missed run for yesterday." - ) - slack_notifier.send_info_notification( - message=message, - title="Missed Runs Detected", - ) - - # Run for the missed day (just run for yesterday, not all missed days) - yesterday = today - timedelta(days=1) - logger.info(f"Executing missed run for {yesterday}") + logger.error(f"Error saving last run date: {e}") + + def update_healthcheck(self, message=None): + """Update the healthcheck file with current timestamp and optional message""" try: - run_oracle(force_date=yesterday) - return True + with open(HEALTHCHECK_FILE, "w") as f: + timestamp = datetime.now().isoformat() + f.write(f"Last update: {timestamp}") + if message: + f.write(f"\n{message}") except Exception as e: - logger.error(f"Failed to execute missed run for {yesterday}: {e}") - # The oracle.main() has already sent a detailed failure notification. - # We can send an additional, scheduler-specific notification if desired. - if slack_notifier: - slack_notifier.send_failure_notification( - error_message=f"The missed run for {yesterday} failed. See previous error for details.", - stage="Scheduler Missed Run Execution" + logger.warning(f"Failed to update healthcheck file: {e}") + + @retry( + stop=stop_after_attempt(5), + wait=wait_exponential(multiplier=1, min=60, max=600), + retry=retry_if_exception_type(Exception), + before_sleep=lambda retry_state: logger.warning( + f"Retry attempt {retry_state.attempt_number} after error: {retry_state.outcome.exception()}" + ), + ) + def run_oracle(self, run_date_override=None): + """ + Function to run the Service Quality Oracle + + Args: + run_date_override: If provided, override the date for this run + """ + run_date = run_date_override or datetime.now().date() + start_time = datetime.now() + logger.info(f"Starting Service Quality Oracle run at {start_time} for date {run_date}") + + # The oracle.main() function handles its own exceptions, notifications, and credential setup. + # The scheduler's role is simply to trigger it and handle the retry logic. + oracle.main(run_date_override=run_date) + + # If oracle.main() completes without sys.exit, it was successful. + # Record successful run and update healthcheck. + self.save_last_run_date(run_date) + end_time = datetime.now() + duration_in_seconds = (end_time - start_time).total_seconds() + success_message = f"Scheduler successfully triggered oracle run for {run_date}. Duration: {duration_in_seconds:.2f}s" + logger.info(success_message) + self.update_healthcheck(success_message) + + def check_missed_runs(self): + """Check if we missed any runs and execute them if needed""" + today = datetime.now().date() + last_run = self.get_last_run_date() + + if last_run is None: + logger.info("No record of previous runs. Will run at next scheduled time.") + return + + if last_run < today - timedelta(days=1): + missed_days = (today - last_run).days - 1 + logger.warning(f"Detected {missed_days} missed runs. Last run was on {last_run}.") + + if self.slack_notifier: + message = ( + f"Detected {missed_days} missed oracle runs. " + f"Last successful run was on {last_run}. 
" + "Attempting to execute missed run for yesterday." + ) + self.slack_notifier.send_info_notification( + message=message, + title="Missed Runs Detected", + ) + + yesterday = today - timedelta(days=1) + logger.info(f"Executing missed run for {yesterday}") + # The run_oracle method is decorated with @retry, so it will handle its own retries. + self.run_oracle(run_date_override=yesterday) + + def initialize(self): + """Initialize the scheduler and validate configuration""" + logger.info("Initializing scheduler...") + try: + from src.utils.config_loader import validate_all_required_env_vars + validate_all_required_env_vars() + + credential_manager.setup_google_credentials() + config = load_config() + + self.slack_notifier = create_slack_notifier(config.get("SLACK_WEBHOOK_URL")) + if self.slack_notifier: + logger.info("Slack notifications enabled for scheduler") + startup_message = ( + f"Service Quality Oracle scheduler started successfully.\n" + f"**Scheduled time:** {config['SCHEDULED_RUN_TIME']} UTC\n" + f"**Environment:** {os.environ.get('ENVIRONMENT', 'unknown')}" + ) + self.slack_notifier.send_info_notification( + message=startup_message, + title="Scheduler Started", ) - return False - return False - - -def initialize(): - """Initialize the scheduler and validate configuration""" - global slack_notifier - logger.info("Initializing scheduler...") - try: - # Early validation of required environment variables - from src.utils.config_loader import validate_all_required_env_vars - - logger.info("Validating required environment variables...") - validate_all_required_env_vars() - - # Validate credentials early to fail fast if there are issues - credential_manager.setup_google_credentials() - - # Load and validate configuration - config = load_config() - - # Initialize Slack notifications - slack_notifier = create_slack_notifier(config.get("slack_webhook_url")) - if slack_notifier: - logger.info("Slack notifications enabled for scheduler") - - # Send startup notification - startup_message = ( - f"Service Quality Oracle scheduler started successfully.\n" - f"**Scheduled time:** {config['scheduled_run_time']} UTC\n" - f"**Environment:** {os.environ.get('ENVIRONMENT', 'unknown')}" - ) - slack_notifier.send_info_notification( - message=startup_message, - title="Scheduler Started", - ) - else: - logger.info("Slack notifications disabled for scheduler") - - # Set timezone for consistent scheduling - timezone = pytz.timezone("UTC") - logger.info(f"Using timezone: {timezone}") - - # Schedule the job - run_time = config["scheduled_run_time"] - logger.info(f"Scheduling daily run at {run_time} UTC") - schedule.every().day.at(run_time).do(run_oracle) - - # Create initial healthcheck file - update_healthcheck("Scheduler initialized") - - # Run on startup if requested - if os.environ.get("RUN_ON_STARTUP", "false").lower() == "true": - logger.info("RUN_ON_STARTUP=true, executing oracle immediately") - run_oracle() - - else: - # Check for missed runs - logger.info("Checking for missed runs...") - - if check_missed_runs(): - logger.info("Executed missed run successfully") + else: + logger.info("Slack notifications disabled for scheduler") + + pytz.timezone("UTC") + run_time = config["SCHEDULED_RUN_TIME"] + logger.info(f"Scheduling daily run at {run_time} UTC") + schedule.every().day.at(run_time).do(self.run_oracle, run_date_override=None) + + self.update_healthcheck("Scheduler initialized") + if os.environ.get("RUN_ON_STARTUP", "false").lower() == "true": + logger.info("RUN_ON_STARTUP=true, executing oracle 
immediately") + self.run_oracle() else: - logger.info("No missed runs to execute") + # Check for missed runs + logger.info("Checking for missed runs...") + self.check_missed_runs() - return config + return config - except Exception as e: - logger.error(f"Failed to initialize scheduler: {e}", exc_info=True) + except Exception as e: + logger.error(f"Failed to initialize scheduler: {e}", exc_info=True) + if self.slack_notifier: + self.slack_notifier.send_failure_notification( + error_message=str(e), stage="Scheduler Initialization", execution_time=0 + ) + sys.exit(1) - # Try to send failure notification even if initialization failed - if slack_notifier: - slack_notifier.send_failure_notification( - error_message=str(e), stage="Scheduler Initialization", execution_time=0 - ) + def run(self): + """Main loop for the scheduler""" + logger.info("Scheduler started and waiting for scheduled runs") + try: + while True: + schedule.run_pending() + self.update_healthcheck("Scheduler heartbeat") + time.sleep(60) + + except KeyboardInterrupt: + logger.info("Scheduler stopped by user") + if self.slack_notifier: + self.slack_notifier.send_info_notification( + message="Scheduler stopped by user interrupt", title="Scheduler Stopped" + ) - sys.exit(1) + except Exception as e: + logger.error(f"Scheduler crashed: {e}", exc_info=True) + if self.slack_notifier: + self.slack_notifier.send_failure_notification( + error_message=str(e), stage="Scheduler Runtime", execution_time=0 + ) + sys.exit(1) if __name__ == "__main__": - # Initialize the scheduler - config = initialize() - logger.info("Scheduler started and waiting for scheduled runs") - - # Main loop - try: - while True: - schedule.run_pending() - update_healthcheck("Scheduler heartbeat") - time.sleep(60) - - except KeyboardInterrupt: - logger.info("Scheduler stopped by user") - - if slack_notifier: - slack_notifier.send_info_notification( - message="Scheduler stopped by user interrupt", title="Scheduler Stopped" - ) - - except Exception as e: - logger.error(f"Scheduler crashed: {e}", exc_info=True) - - # Send failure notification to slack - if slack_notifier: - slack_notifier.send_failure_notification( - error_message=str(e), stage="Scheduler Runtime", execution_time=0 - ) - - # Exit the scheduler - sys.exit(1) + scheduler = Scheduler() + scheduler.run() diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 52c446b..917846c 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -21,7 +21,7 @@ from src.models.blockchain_client import BlockchainClient from src.models.data_processor import DataProcessor from src.utils.config_loader import load_config -from src.utils.config_manager import credential_manager +from src.utils.config_manager import config_manager, credential_manager from src.utils.slack_notifier import create_slack_notifier # Set up basic logging @@ -29,7 +29,7 @@ logger = logging.getLogger(__name__) -def main(): +def main(run_date_override: date = None): """ Main entry point for the Service Quality Oracle. This function: @@ -37,6 +37,9 @@ def main(): 2. Fetches and processes indexer eligibility data 3. Submits eligible indexers to the blockchain 4. Sends Slack notifications about the run status + + Args: + run_date_override: If provided, use this date for the run instead of today. 
""" start_time = time.time() slack_notifier = None @@ -61,25 +64,32 @@ def main(): except Exception: credential_manager.setup_google_credentials() + # Define the date for the current run + current_run_date = run_date_override or date.today() + # Fetch + save indexer eligibility data and return eligible list stage = "Data Processing" data_processor = DataProcessor(config) eligible_indexers = data_processor.process_and_get_eligible_indexers( - start_date=date.today() - timedelta(days=28), - end_date=date.today(), - current_date=date.today(), + start_date=current_run_date - timedelta(days=28), + end_date=current_run_date, + current_date=current_run_date, ) logger.info(f"Found {len(eligible_indexers)} eligible indexers.") + + data_processor.clean_old_date_directories(config["MAX_AGE_BEFORE_DELETION"]) + - # Send eligible indexers to the blockchain contract + # --- Blockchain Submission Stage --- stage = "Blockchain Submission" + logger.info("Instantiating BlockchainClient...") blockchain_client = BlockchainClient() transaction_links = blockchain_client.batch_allow_indexers_issuance_eligibility( indexer_addresses=eligible_indexers, - private_key=config["private_key"], - chain_id=config["chain_id"], - contract_function=config["contract_function"], - batch_size=config.get("BATCH_SIZE"), + private_key=config["PRIVATE_KEY"], + chain_id=config["CHAIN_ID"], + contract_function=config["CONTRACT_FUNCTION"], + batch_size=config["BATCH_SIZE"], replace=True, ) diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index bba2cf7..14ca28b 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -205,31 +205,31 @@ def get_flat_config(self) -> dict[str, Any]: # Convert nested structure to flat format flat_config = { # BigQuery settings - "bigquery_location": config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), - "bigquery_project_id": config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), - "bigquery_dataset_id": config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), + "BIGQUERY_LOCATION": config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), + "BIGQUERY_PROJECT_ID": config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), + "BIGQUERY_DATASET_ID": config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), # Blockchain settings - "contract_address": config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), - "contract_function": config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), - "chain_id": config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID"), - "rpc_providers": self._parse_rpc_urls(config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), + "CONTRACT_ADDRESS": config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), + "CONTRACT_FUNCTION": config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), + "CHAIN_ID": config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID"), + "RPC_PROVIDERS": self._parse_rpc_urls(config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), # Scheduling - "scheduled_run_time": config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), + "SCHEDULED_RUN_TIME": config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), # Subgraph URLs - "subgraph_url": config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), + "SUBGRAPH_URL": config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), # Processing settings - "batch_size": config.get("processing", {}).get("BATCH_SIZE", 125), - "max_age_before_deletion": config.get("processing", 
{}).get("MAX_AGE_BEFORE_DELETION", 120), + "BATCH_SIZE": config.get("processing", {}).get("BATCH_SIZE", 125), + "MAX_AGE_BEFORE_DELETION": config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120), # Secrets - "google_application_credentials": config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), - "private_key": config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), - "studio_api_key": config.get("secrets", {}).get("STUDIO_API_KEY"), - "slack_webhook_url": config.get("secrets", {}).get("SLACK_WEBHOOK_URL"), + "GOOGLE_APPLICATION_CREDENTIALS": config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), + "PRIVATE_KEY": config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), + "STUDIO_API_KEY": config.get("secrets", {}).get("STUDIO_API_KEY"), + "SLACK_WEBHOOK_URL": config.get("secrets", {}).get("SLACK_WEBHOOK_URL"), } # fmt: on From 2716d281f62e173642a7acee436b98e831134eeb Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 17:20:16 +0200 Subject: [PATCH 28/48] Never try to run the scheduler for more than 7 days at a time on restart. --- src/models/scheduler.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/models/scheduler.py b/src/models/scheduler.py index c20d5ef..037df47 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -32,15 +32,31 @@ def __init__(self): self.config = self.initialize() def get_last_run_date(self): - """Get the date of the last successful run from a persistent file""" + """ + Get the date of the last successful run from a persistent file. + If the last run is older than 7 days, cap it at 7 days ago to limit BigQuery costs. + """ + last_run_date = None if os.path.exists(LAST_RUN_FILE): try: with open(LAST_RUN_FILE) as f: last_run_str = f.read().strip() - return datetime.strptime(last_run_str, "%Y-%m-%d").date() + last_run_date = datetime.strptime(last_run_str, "%Y-%m-%d").date() except Exception as e: - logger.error(f"Error reading last run date: {e}") - return None + logger.error(f"Error reading or parsing last run date file: {e}") + return None + + today = datetime.now().date() + seven_days_ago = today - timedelta(days=7) + + if last_run_date and last_run_date < seven_days_ago: + logger.warning( + f"Last successful run was on {last_run_date}, which is more than 7 days ago. " + f"Capping backfill to 7 days to conserve BigQuery credits." 
+ ) + return seven_days_ago + + return last_run_date def save_last_run_date(self, run_date): """Save the date of the last successful run to a file that we continuously overwrite each time""" From 5320a4f238853bf2a6fd3b3a45610afd44aece42 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 17:51:51 +0200 Subject: [PATCH 29/48] audit cp5 --- src/utils/config_loader.py | 47 ++++++++++++- src/utils/config_manager.py | 131 ------------------------------------ 2 files changed, 44 insertions(+), 134 deletions(-) diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index 14ca28b..73c633f 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -17,6 +17,7 @@ import sys from pathlib import Path from typing import Any, Optional +from datetime import datetime # Handle Python version compatibility for TOML loading if sys.version_info >= (3, 11): @@ -251,9 +252,48 @@ def _parse_rpc_urls(self, rpc_urls: list) -> list[str]: return valid_providers +def _validate_config(config: dict[str, Any]) -> dict[str, Any]: + """Helper function to validate the loaded configuration.""" + # Validate and convert chain_id to integer + if config.get("chain_id"): + try: + config["chain_id"] = int(config["chain_id"]) + except (ValueError, TypeError) as e: + raise ConfigurationError( + f"Invalid BLOCKCHAIN_CHAIN_ID: {config['chain_id']} - must be an integer." + ) from e + + # Validate scheduled run time format (HH:MM) + if config.get("scheduled_run_time"): + try: + datetime.strptime(config["scheduled_run_time"], "%H:%M") + except (ValueError, TypeError) as e: + raise ConfigurationError( + f"Invalid SCHEDULED_RUN_TIME format: {config['scheduled_run_time']} - must be in HH:MM format" + ) from e + + # Validate required fields + required_fields = [ + "private_key", + "contract_address", + "contract_function", + "chain_id", + "scheduled_run_time", + ] + missing_fields = [field for field in required_fields if not config.get(field)] + if missing_fields: + raise ConfigurationError(f"Missing required configuration fields: {', '.join(missing_fields)}") + + # Validate RPC providers + if not config.get("rpc_providers") or not isinstance(config["rpc_providers"], list): + raise ConfigurationError("BLOCKCHAIN_RPC_URLS must be a list of valid RPC URLs") + + return config + + def load_config() -> dict[str, Any]: """ - Convenience function to load configuration. + Convenience function to load and validate configuration. Returns configuration in flat format compatible with existing codebase. 
@@ -261,10 +301,11 @@ def load_config() -> dict[str, Any]: Dictionary containing configuration with secrets from environment variables Raises: - ConfigurationError: If configuration loading fails + ConfigurationError: If configuration loading or validation fails """ loader = ConfigLoader() - return loader.get_flat_config() + flat_config = loader.get_flat_config() + return _validate_config(flat_config) def validate_all_required_env_vars() -> None: diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py index 9622660..b40d280 100644 --- a/src/utils/config_manager.py +++ b/src/utils/config_manager.py @@ -5,142 +5,12 @@ import json import logging import os -from datetime import datetime from pathlib import Path from typing import Any -from src.utils.config_loader import ConfigLoader, ConfigurationError -from src.utils.retry_decorator import retry_with_backoff - logger = logging.getLogger(__name__) -class ConfigManager: - """Centralized configuration manager with validation and credential handling.""" - - def __init__(self): - self._config = None - - - def _validate_required_fields(self, data: dict, required_fields: list[str], context: str) -> None: - """ - Helper function to validate required fields are present in a dictionary. - - Args: - data: Dictionary to validate - required_fields: List of required fields - context: Context for error message - - Raises: - ValueError: If required fields are missing - """ - # Validate that all required fields are present in the data - missing_fields = [field for field in required_fields if field not in data] - - # If any required fields are missing, raise an error - if missing_fields: - raise ValueError(f"{context}: missing {missing_fields}") - - - def load_and_validate_config(self) -> dict[str, Any]: - """ - Load all necessary configurations using config loader, validate, and return them. - This function is called once at startup to load the configuration. - - Returns: - Dict[str, Any]: Config dictionary with validated and converted values. - { - "bigquery_project_id": str, - "bigquery_location": str, - "rpc_providers": list[str], - "contract_address": str, - "contract_function": str, - "chain_id": int, - "scheduled_run_time": str, - "batch_size": int, - "max_age_before_deletion": int, - } - Raises: - ConfigurationError: If configuration loading fails - ValueError: If configuration validation fails - """ - # If the configuration has already been loaded, return it - if self._config is not None: - return self._config - - try: - # Load configuration using config loader - loader = ConfigLoader() - config = loader.get_flat_config() - logger.info("Successfully loaded configuration") - - # Validate and convert chain_id to integer - if config.get("chain_id"): - try: - config["chain_id"] = int(config["chain_id"]) - except ValueError as e: - raise ValueError( - f"Invalid BLOCKCHAIN_CHAIN_ID: {config['chain_id']} - must be an integer." 
- ) from e - - # Validate scheduled run time format (HH:MM) - if config.get("scheduled_run_time"): - try: - datetime.strptime(config["scheduled_run_time"], "%H:%M") - except ValueError as e: - raise ValueError( - f"Invalid SCHEDULED_RUN_TIME format: {config['scheduled_run_time']} - " - "must be in HH:MM format" - ) from e - - # Validate blockchain configuration contains all required fields - required_fields = [ - "private_key", - "contract_address", - "contract_function", - "chain_id", - "scheduled_run_time", - ] - self._validate_required_fields(config, required_fields, "Missing required blockchain configuration") - - # Validate RPC providers - if not config.get("rpc_providers") or not isinstance(config["rpc_providers"], list): - raise ValueError("BLOCKCHAIN_RPC_URLS must be a list of valid RPC URLs") - - # Set the configuration in the class & return it - self._config = config - return config - - except ConfigurationError: - raise - except Exception as e: - raise ConfigurationError(f"Configuration validation failed: {e}") from e - - - @staticmethod - def get_project_root() -> Path: - """ - Get the path to the project root directory. - In Docker environments, use /app. Otherwise, find by marker files. - """ - # Use the /app directory as the project root if it exists - docker_path = Path("/app") - if docker_path.exists(): - return docker_path - - # If the /app directory doesn't exist fall back to marker files - current_path = Path(__file__).parent - while current_path != current_path.parent: - if (current_path / ".gitignore").exists() or (current_path / "pyproject.toml").exists(): - logger.info(f"Found project root at: {current_path}") - return current_path - # Attempt to traverse upwards (will not work if the directory has no parent) - current_path = current_path.parent - - # If we got here, something is wrong - raise FileNotFoundError("Could not find project root directory. Investigate.") - - class CredentialManager: """Handles credential management for Google Cloud services.""" @@ -346,5 +216,4 @@ def validate_google_credentials(self) -> bool: # Global instances for easy access -config_manager = ConfigManager() credential_manager = CredentialManager() From 1b60863817a87274122620235394bf45ebd24f62 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 17:53:58 +0200 Subject: [PATCH 30/48] ALL CAPS --- src/utils/config_loader.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index 73c633f..aa8b498 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -255,30 +255,30 @@ def _parse_rpc_urls(self, rpc_urls: list) -> list[str]: def _validate_config(config: dict[str, Any]) -> dict[str, Any]: """Helper function to validate the loaded configuration.""" # Validate and convert chain_id to integer - if config.get("chain_id"): + if config.get("CHAIN_ID"): try: - config["chain_id"] = int(config["chain_id"]) + config["CHAIN_ID"] = int(config["CHAIN_ID"]) except (ValueError, TypeError) as e: raise ConfigurationError( - f"Invalid BLOCKCHAIN_CHAIN_ID: {config['chain_id']} - must be an integer." + f"Invalid BLOCKCHAIN_CHAIN_ID: {config['CHAIN_ID']} - must be an integer." 
) from e # Validate scheduled run time format (HH:MM) - if config.get("scheduled_run_time"): + if config.get("SCHEDULED_RUN_TIME"): try: - datetime.strptime(config["scheduled_run_time"], "%H:%M") + datetime.strptime(config["SCHEDULED_RUN_TIME"], "%H:%M") except (ValueError, TypeError) as e: raise ConfigurationError( - f"Invalid SCHEDULED_RUN_TIME format: {config['scheduled_run_time']} - must be in HH:MM format" + f"Invalid SCHEDULED_RUN_TIME format: {config['SCHEDULED_RUN_TIME']} - must be in HH:MM format" ) from e # Validate required fields required_fields = [ - "private_key", - "contract_address", - "contract_function", - "chain_id", - "scheduled_run_time", + "PRIVATE_KEY", + "CONTRACT_ADDRESS", + "CONTRACT_FUNCTION", + "CHAIN_ID", + "SCHEDULED_RUN_TIME", ] missing_fields = [field for field in required_fields if not config.get(field)] if missing_fields: From 2b2134d0d4148ba9ab0b64b4a72c769e598e0dc3 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 18:06:01 +0200 Subject: [PATCH 31/48] Update config_loader.py --- src/utils/config_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py index aa8b498..757c22f 100644 --- a/src/utils/config_loader.py +++ b/src/utils/config_loader.py @@ -285,7 +285,7 @@ def _validate_config(config: dict[str, Any]) -> dict[str, Any]: raise ConfigurationError(f"Missing required configuration fields: {', '.join(missing_fields)}") # Validate RPC providers - if not config.get("rpc_providers") or not isinstance(config["rpc_providers"], list): + if not config.get("RPC_PROVIDERS") or not isinstance(config["RPC_PROVIDERS"], list): raise ConfigurationError("BLOCKCHAIN_RPC_URLS must be a list of valid RPC URLs") return config From f4c1f4d76f4ba1a000ba1bdf292b2e73802bfa14 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 18:06:05 +0200 Subject: [PATCH 32/48] Update service_quality_oracle.py --- src/models/service_quality_oracle.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 917846c..ac15422 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -21,7 +21,6 @@ from src.models.blockchain_client import BlockchainClient from src.models.data_processor import DataProcessor from src.utils.config_loader import load_config -from src.utils.config_manager import config_manager, credential_manager from src.utils.slack_notifier import create_slack_notifier # Set up basic logging @@ -54,16 +53,6 @@ def main(run_date_override: date = None): else: logger.info("Slack notifications disabled (no webhook URL configured)") - # Attempt to load google bigquery data access credentials - stage = "Authentication" - try: - # fmt: off - import google.auth - _ = google.auth.default() - # fmt: on - except Exception: - credential_manager.setup_google_credentials() - # Define the date for the current run current_run_date = run_date_override or date.today() From 7e12baa31f902ecc02d307cbde7efb8f96ec571f Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 18:25:58 +0200 Subject: [PATCH 33/48] Create configuration.py --- src/utils/configuration.py | 380 +++++++++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 src/utils/configuration.py diff --git a/src/utils/configuration.py b/src/utils/configuration.py new file mode 100644 index 0000000..af358d7 --- /dev/null +++ b/src/utils/configuration.py @@ -0,0 +1,380 @@ +""" 
+Centralized configuration and credential management for the Service Quality Oracle. +""" + +import json +import logging +import os +import re +import sys +from datetime import datetime +from pathlib import Path +from typing import Any, Optional + +# Handle Python version compatibility for TOML loading +if sys.version_info >= (3, 11): + import tomllib +else: + import tomli as tomllib + +logger = logging.getLogger(__name__) + + +class ConfigurationError(Exception): + """Raised when configuration loading or validation fails.""" + pass + + +# --- Configuration Loading --- + +class ConfigLoader: + """Internal class to load configuration from TOML and environment variables.""" + + def __init__(self, config_path: Optional[str] = None): + """Initialize the config loader""" + self.config_path = config_path or self._get_default_config_path() + self._env_var_pattern = re.compile(r"\$([A-Z_][A-Z0-9_]*)") + + + def _get_default_config_path(self) -> str: + """Get the default configuration template path.""" + # Check if we're in a Docker container + docker_path = Path("/app/config.toml") + if docker_path.exists(): + return str(docker_path) + + # For local development, look in project root + current_path = Path(__file__).parent + while current_path != current_path.parent: + config_path = current_path / "config.toml" + if config_path.exists(): + return str(config_path) + current_path = current_path.parent + + raise ConfigurationError("Could not find config.toml in project root or Docker container") + + + def _substitute_env_vars(self, config_toml: Any) -> Any: + """ + Recursively substitute environment variables in the config. + + Supports $VARIABLE_NAME syntax for environment variable substitution. + + Args: + config_toml: config file to process + + Returns: + Processed config with environment variables substituted + + Raises: + ConfigurationError: If required environment variable is missing + """ + if isinstance(config_toml, str): + # Find all environment variable references + env_vars = self._env_var_pattern.findall(config_toml) + + + for env_var in env_vars: + env_value = os.getenv(env_var) + if env_value is None: + raise ConfigurationError(f"Required environment variable {env_var} is not set") + + # Replace the environment variable reference with actual value + config_toml = config_toml.replace(f"${env_var}", env_value) + + return config_toml + + elif isinstance(config_toml, dict): + return {k: self._substitute_env_vars(v) for k, v in config_toml.items()} + + elif isinstance(config_toml, list): + return [self._substitute_env_vars(item) for item in config_toml] + + return config_toml + + + def _get_raw_config(self) -> dict: + """ + Get raw configuration from TOML file. + + Returns: + toml file as a dictionary + """ + try: + with open(self.config_path, "rb") as f: + return tomllib.load(f) + + except FileNotFoundError as e: + raise ConfigurationError(f"Configuration not found: {self.config_path}") from e + + except Exception as e: + raise ConfigurationError(f"Failed to parse configuration: {e}") from e + + + def get_flat_config(self) -> dict[str, Any]: + """ + Get configuration in flat format. 
+ + Returns: + Flat dictionary with all configuration values + """ + raw_config = self._get_raw_config() + substituted_config = self._substitute_env_vars(raw_config) + + # fmt: off + # Convert nested structure to flat format + return { + # BigQuery settings + "BIGQUERY_LOCATION": substituted_config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), + "BIGQUERY_PROJECT_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), + "BIGQUERY_DATASET_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), + + # Blockchain settings + "CONTRACT_ADDRESS": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), + "CONTRACT_FUNCTION": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), + "CHAIN_ID": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID"), + "RPC_PROVIDERS": self._parse_rpc_urls(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), + + # Scheduling + "SCHEDULED_RUN_TIME": substituted_config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), + + # Subgraph URLs + "SUBGRAPH_URL": substituted_config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), + + # Processing settings + "BATCH_SIZE": substituted_config.get("processing", {}).get("BATCH_SIZE", 125), + "MAX_AGE_BEFORE_DELETION": substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120), + + # Secrets + "GOOGLE_APPLICATION_CREDENTIALS": substituted_config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), + "PRIVATE_KEY": substituted_config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), + "STUDIO_API_KEY": substituted_config.get("secrets", {}).get("STUDIO_API_KEY"), + "SLACK_WEBHOOK_URL": substituted_config.get("secrets", {}).get("SLACK_WEBHOOK_URL"), + } + # fmt: on + + + def _parse_rpc_urls(self, rpc_urls: list) -> list[str]: + """Parse RPC URLs from list format.""" + if not rpc_urls or not isinstance(rpc_urls, list) or not all(isinstance(url, str) for url in rpc_urls): + raise ConfigurationError("BLOCKCHAIN_RPC_URLS must be a list of valid string providers") + + valid_providers = [url.strip() for url in rpc_urls if url.strip()] + if not valid_providers: + + raise ConfigurationError("No valid RPC providers found in BLOCKCHAIN_RPC_URLS") + + return valid_providers + + + def _collect_missing_env_vars(self, obj: Any) -> list[str]: + """ + Collect all missing environment variables from config object. 
+ + Args: + obj: config object to collect missing environment variables from + + Returns: + list of missing environment variables (if any) + """ + missing = [] + # Collect the missing environment variables using the appropriate type-specific method + if isinstance(obj, str): + env_vars = self._env_var_pattern.findall(obj) + for var in env_vars: + if os.getenv(var) is None: + missing.append(var) + + elif isinstance(obj, dict): + for value in obj.values(): + missing.extend(self._collect_missing_env_vars(value)) + + elif isinstance(obj, list): + for item in obj: + missing.extend(self._collect_missing_env_vars(item)) + + # After all the missing variables have been collected, return the list + return missing + + + def get_missing_env_vars(self) -> list[str]: + raw_config = self._get_raw_config() + return self._collect_missing_env_vars(raw_config) + + +def _validate_config(config: dict[str, Any]) -> dict[str, Any]: + if config.get("CHAIN_ID"): + try: + config["CHAIN_ID"] = int(config["CHAIN_ID"]) + except (ValueError, TypeError) as e: + raise ConfigurationError(f"Invalid CHAIN_ID: {config['CHAIN_ID']} - must be an integer.") from e + + if config.get("SCHEDULED_RUN_TIME"): + try: + datetime.strptime(config["SCHEDULED_RUN_TIME"], "%H:%M") + except (ValueError, TypeError) as e: + raise ConfigurationError(f"Invalid SCHEDULED_RUN_TIME: {config['SCHEDULED_RUN_TIME']} - must be HH:MM.") from e + + required = ["PRIVATE_KEY", "CONTRACT_ADDRESS", "CONTRACT_FUNCTION", "CHAIN_ID", "SCHEDULED_RUN_TIME"] + missing = [field for field in required if not config.get(field)] + if missing: + raise ConfigurationError(f"Missing required configuration fields: {', '.join(missing)}") + + return config + + +def load_config() -> dict[str, Any]: + """Loads, validates, and returns the application configuration.""" + loader = ConfigLoader() + flat_config = loader.get_flat_config() + logger.info("Successfully loaded configuration") + return _validate_config(flat_config) + + +def validate_all_required_env_vars() -> None: + """Validates that all required environment variables are set.""" + loader = ConfigLoader() + missing = loader.get_missing_env_vars() + if missing: + raise ConfigurationError(f"Missing required environment variables: {', '.join(sorted(set(missing)))}") + logger.info("Successfully validated all required environment variables") + + +# --- Credential Management --- + +class CredentialManager: + """Handles credential management for Google Cloud services.""" + + + def _parse_and_validate_credentials_json(self, creds_env: str) -> dict: + """ + Parse and validate Google credentials JSON from environment variable. 
+ + Args: + creds_env: JSON string containing credentials + + Returns: + dict: Parsed and validated credentials data + + Raises: + ValueError: If JSON is invalid or credentials are incomplete + """ + # Try to parse the credentials + try: + # Parse the credentials + creds_data = json.loads(creds_env) + cred_type = creds_data.get("type", "") + + # Validate the credentials data based on the type + if cred_type == "authorized_user": + required = ["client_id", "client_secret", "refresh_token"] + if not all(k in creds_data for k in required): + raise ValueError("Incomplete authorized_user credentials") + + elif cred_type == "service_account": + required = ["private_key", "client_email", "project_id"] + if not all(k in creds_data for k in required): + raise ValueError("Incomplete service_account credentials") + + else: + raise ValueError(f"Unsupported credential type: '{cred_type}'") + + return creds_data + + except Exception as e: + raise ValueError(f"Invalid credentials JSON: {e}") from e + + + def _setup_user_credentials_in_memory(self, creds_data: dict) -> None: + """Set up user account credentials directly in memory.""" + import google.auth + from google.oauth2.credentials import Credentials + + # Try to set up the credentials + try: + credentials = Credentials( + token=None, + refresh_token=creds_data.get("refresh_token"), + client_id=creds_data.get("client_id"), + client_secret=creds_data.get("client_secret"), + token_uri="https://oauth2.googleapis.com/token", + ) + + # Set credentials globally for GCP libraries + google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] + logger.info("Successfully loaded user account credentials from environment variable") + + # Clear credentials from memory + finally: + if "creds_data" in locals(): + creds_data.clear() + + + def _setup_service_account_credentials_in_memory(self, creds_data: dict) -> None: + """Set up service account credentials directly in memory.""" + import google.auth + from google.oauth2 import service_account + + # Try to set up the credentials + try: + # Create credentials object directly from dict + credentials = service_account.Credentials.from_service_account_info(creds_data) + + # Set credentials globally for GCP libraries + google.auth._default._CREDENTIALS = credentials + logger.info("Successfully loaded service account credentials from environment variable") + + # If the credentials creation fails, raise an error + except Exception as e: + raise ValueError(f"Invalid service account credentials: {e}") from e + + # Clear the original credentials dict from memory if it exists + finally: + if "creds_data" in locals(): + creds_data.clear() + + + def setup_google_credentials(self) -> None: + """ + Set up Google credentials directly in memory from environment variable. + This function handles multiple credential formats securely: + 1. JSON string in GOOGLE_APPLICATION_CREDENTIALS (inline credentials) + 2. File path in GOOGLE_APPLICATION_CREDENTIALS + """ + # Get the account credentials from the environment variable + creds_env = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") + + # If the credentials are not set, log a warning and return + if not creds_env: + logger.warning("GOOGLE_APPLICATION_CREDENTIALS not set. 
Falling back to gcloud CLI.") + return + + # Case 1: JSON credentials provided inline + if creds_env.strip().startswith("{"): + creds_data = None + try: + # Parse and validate the credentials + creds_data = self._parse_and_validate_credentials_json(creds_env) + + # Set up the credentials based on the type + if creds_data.get("type") == "authorized_user": + self._setup_user_credentials_in_memory(creds_data.copy()) + else: + self._setup_service_account_credentials_in_memory(creds_data.copy()) + + # If the credentials parsing fails, raise an error + except Exception as e: + raise ValueError(f"Error processing inline credentials: {e}") from e + + # Clear the credentials from memory + finally: + if creds_data: + creds_data.clear() + + # Case 2: File path provided + elif not os.path.exists(creds_env): + logger.warning(f"GOOGLE_APPLICATION_CREDENTIALS is not valid JSON or a file path.") + logger.warning("Falling back to gcloud CLI authentication if available.") + +# Global instance for easy access +credential_manager = CredentialManager() \ No newline at end of file From e6d87302d77fcd0fabc5d5eacbad0973aa442b41 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 18:38:11 +0200 Subject: [PATCH 34/48] Consolidate config_manager and config_loader into singular configuration module --- src/models/scheduler.py | 4 +- src/models/service_quality_oracle.py | 2 +- src/models/subgraph_data_access_provider.py | 3 +- src/utils/config_loader.py | 319 -------------------- src/utils/config_manager.py | 219 -------------- 5 files changed, 3 insertions(+), 544 deletions(-) delete mode 100644 src/utils/config_loader.py delete mode 100644 src/utils/config_manager.py diff --git a/src/models/scheduler.py b/src/models/scheduler.py index 037df47..0c962b7 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -9,8 +9,7 @@ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential import src.models.service_quality_oracle as oracle -from src.utils.config_loader import load_config -from src.utils.config_manager import credential_manager +from src.utils.configuration import credential_manager, load_config, validate_all_required_env_vars from src.utils.slack_notifier import create_slack_notifier # Configure logging @@ -143,7 +142,6 @@ def initialize(self): """Initialize the scheduler and validate configuration""" logger.info("Initializing scheduler...") try: - from src.utils.config_loader import validate_all_required_env_vars validate_all_required_env_vars() credential_manager.setup_google_credentials() diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index ac15422..0e59238 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -20,7 +20,7 @@ # Import data access utilities with absolute import from src.models.blockchain_client import BlockchainClient from src.models.data_processor import DataProcessor -from src.utils.config_loader import load_config +from src.utils.configuration import load_config from src.utils.slack_notifier import create_slack_notifier # Set up basic logging diff --git a/src/models/subgraph_data_access_provider.py b/src/models/subgraph_data_access_provider.py index 653d41c..6a60ae8 100644 --- a/src/models/subgraph_data_access_provider.py +++ b/src/models/subgraph_data_access_provider.py @@ -30,8 +30,7 @@ def __init__(self): Initialize the subgraph provider. Automatically loads configuration from config loader. 
""" - # Import here to avoid circular imports - from src.utils.config_loader import load_config + from src.utils.configuration import load_config # Load configuration config = load_config() diff --git a/src/utils/config_loader.py b/src/utils/config_loader.py deleted file mode 100644 index 757c22f..0000000 --- a/src/utils/config_loader.py +++ /dev/null @@ -1,319 +0,0 @@ -""" -Configuration Loader for Service Quality Oracle - -This module implements TOML + environment variables: -- Config is defined in TOML -- Sensitive values are loaded from environment variables - -Benefits: -- Clear separation between structure and sensitive data -- Production-ready for Docker -- Environment variable substitution with $VARIABLE_NAME syntax -""" - -import logging -import os -import re -import sys -from pathlib import Path -from typing import Any, Optional -from datetime import datetime - -# Handle Python version compatibility for TOML loading -if sys.version_info >= (3, 11): - import tomllib -else: - import tomli as tomllib - -logger = logging.getLogger(__name__) - - -class ConfigurationError(Exception): - """Raised when configuration loading fails.""" - - pass - - -class ConfigLoader: - """Configuration loader with environment variable substitution""" - - def __init__(self, config_path: Optional[str] = None): - """Initialize the config loader""" - self.config_path = config_path or self._get_default_config_path() - self._env_var_pattern = re.compile(r"\$([A-Z_][A-Z0-9_]*)") - - - def _get_default_config_path(self) -> str: - """Get the default configuration template path.""" - # Check if we're in a Docker container - docker_path = Path("/app/config.toml") - if docker_path.exists(): - return str(docker_path) - - # For local development, look in project root - current_path = Path(__file__).parent - while current_path != current_path.parent: - config_path = current_path / "config.toml" - if config_path.exists(): - return str(config_path) - current_path = current_path.parent - - raise ConfigurationError("Could not find config.toml in project root or Docker container") - - # TODO: check this... - - - def _substitute_env_vars(self, config_toml: Any) -> Any: - """ - Recursively substitute environment variables in the config. - - Supports $VARIABLE_NAME syntax for environment variable substitution. - - Args: - config_toml: config file to process - - Returns: - Processed config with environment variables substituted - - Raises: - ConfigurationError: If required environment variable is missing - """ - if isinstance(config_toml, str): - # Find all environment variable references - env_vars = self._env_var_pattern.findall(config_toml) - - for env_var in env_vars: - env_value = os.getenv(env_var) - if env_value is None: - raise ConfigurationError(f"Required environment variable {env_var} is not set") - - # Replace the environment variable reference with actual value - config_toml = config_toml.replace(f"${env_var}", env_value) - - return config_toml - - elif isinstance(config_toml, dict): - return {k: self._substitute_env_vars(v) for k, v in config_toml.items()} - - elif isinstance(config_toml, list): - return [self._substitute_env_vars(item) for item in config_toml] - - else: - return config_toml - - - def load_config(self) -> dict[str, Any]: - """ - Load configuration from config.toml and substitute environment variables. 
- - Returns: - Dictionary containing the complete configuration with secrets loaded - from environment variables - - Raises: - ConfigurationError: If config file is missing or env vars are missing - """ - try: - # Load the TOML configuration - with open(self.config_path, "rb") as f: - config = tomllib.load(f) - - logger.info(f"Loaded configuration from: {self.config_path}") - - # Substitute environment variables throughout the configuration - config = self._substitute_env_vars(config) - - logger.info("Successfully loaded configuration with environment variables") - return config - - except FileNotFoundError as e: - raise ConfigurationError(f"Configuration not found: {self.config_path}") from e - except ConfigurationError: - raise - except Exception as e: - error_context = "parse configuration" if "tomllib" in str(e) else "substitute environment variables" - raise ConfigurationError(f"Failed to {error_context}: {e}") from e - - - def validate_required_env_vars(self) -> None: - """ - Validate that all required environment variables are set without loading full config. - - This can be used for early validation in startup scripts. - - Raises: - ConfigurationError: If any required environment variables are missing - """ - # Load the config file - try: - with open(self.config_path, "rb") as f: - config = tomllib.load(f) - - # If there is an error, raise a ConfigurationError - except Exception as e: - raise ConfigurationError(f"Cannot validate env vars - config error: {e}") from e - - # Collect all missing environment variables from config object - missing_vars = self._collect_missing_env_vars(config) - - # If there are missing variables, raise a ConfigurationError - if missing_vars: - raise ConfigurationError( - f"Missing required environment variables: {', '.join(sorted(set(missing_vars)))}" - ) - - - def _collect_missing_env_vars(self, obj: Any) -> list[str]: - """ - Collect all missing environment variables from config object. - - Args: - obj: config object to collect missing environment variables from - - Returns: - list of missing environment variables (if any) - """ - missing_vars = [] - # Collect the missing enviroment vaiables using the appropriate speicifc method - if isinstance(obj, str): - env_vars = self._env_var_pattern.findall(obj) - for var in env_vars: - if os.getenv(var) is None: - missing_vars.append(var) - - elif isinstance(obj, dict): - for value in obj.values(): - missing_vars.extend(self._collect_missing_env_vars(value)) - - elif isinstance(obj, list): - for item in obj: - missing_vars.extend(self._collect_missing_env_vars(item)) - - # After all the missing variables have been collected, return the list - return missing_vars - - - def get_flat_config(self) -> dict[str, Any]: - """ - Get configuration in flat format. 
- - Returns: - Flat dictionary with all configuration values - """ - config = self.load_config() - - # fmt: off - # Convert nested structure to flat format - flat_config = { - # BigQuery settings - "BIGQUERY_LOCATION": config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), - "BIGQUERY_PROJECT_ID": config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), - "BIGQUERY_DATASET_ID": config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), - - # Blockchain settings - "CONTRACT_ADDRESS": config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), - "CONTRACT_FUNCTION": config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), - "CHAIN_ID": config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID"), - "RPC_PROVIDERS": self._parse_rpc_urls(config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), - - # Scheduling - "SCHEDULED_RUN_TIME": config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), - - # Subgraph URLs - "SUBGRAPH_URL": config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), - - # Processing settings - "BATCH_SIZE": config.get("processing", {}).get("BATCH_SIZE", 125), - "MAX_AGE_BEFORE_DELETION": config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120), - - # Secrets - "GOOGLE_APPLICATION_CREDENTIALS": config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), - "PRIVATE_KEY": config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), - "STUDIO_API_KEY": config.get("secrets", {}).get("STUDIO_API_KEY"), - "SLACK_WEBHOOK_URL": config.get("secrets", {}).get("SLACK_WEBHOOK_URL"), - } - # fmt: on - - return flat_config - - - def _parse_rpc_urls(self, rpc_urls: list) -> list[str]: - """Parse RPC URLs from list format.""" - if not rpc_urls: - raise ConfigurationError("BLOCKCHAIN_RPC_URLS is required") - - if not isinstance(rpc_urls, list) or not all(isinstance(url, str) for url in rpc_urls): - raise ConfigurationError("RPC URLs must be a list of strings") - - valid_providers = [url.strip() for url in rpc_urls if url.strip()] - if not valid_providers: - raise ConfigurationError("No valid RPC providers found") - - return valid_providers - - -def _validate_config(config: dict[str, Any]) -> dict[str, Any]: - """Helper function to validate the loaded configuration.""" - # Validate and convert chain_id to integer - if config.get("CHAIN_ID"): - try: - config["CHAIN_ID"] = int(config["CHAIN_ID"]) - except (ValueError, TypeError) as e: - raise ConfigurationError( - f"Invalid BLOCKCHAIN_CHAIN_ID: {config['CHAIN_ID']} - must be an integer." - ) from e - - # Validate scheduled run time format (HH:MM) - if config.get("SCHEDULED_RUN_TIME"): - try: - datetime.strptime(config["SCHEDULED_RUN_TIME"], "%H:%M") - except (ValueError, TypeError) as e: - raise ConfigurationError( - f"Invalid SCHEDULED_RUN_TIME format: {config['SCHEDULED_RUN_TIME']} - must be in HH:MM format" - ) from e - - # Validate required fields - required_fields = [ - "PRIVATE_KEY", - "CONTRACT_ADDRESS", - "CONTRACT_FUNCTION", - "CHAIN_ID", - "SCHEDULED_RUN_TIME", - ] - missing_fields = [field for field in required_fields if not config.get(field)] - if missing_fields: - raise ConfigurationError(f"Missing required configuration fields: {', '.join(missing_fields)}") - - # Validate RPC providers - if not config.get("RPC_PROVIDERS") or not isinstance(config["RPC_PROVIDERS"], list): - raise ConfigurationError("BLOCKCHAIN_RPC_URLS must be a list of valid RPC URLs") - - return config - - -def load_config() -> dict[str, Any]: - """ - Convenience function to load and validate configuration. 
- - Returns configuration in flat format compatible with existing codebase. - - Returns: - Dictionary containing configuration with secrets from environment variables - - Raises: - ConfigurationError: If configuration loading or validation fails - """ - loader = ConfigLoader() - flat_config = loader.get_flat_config() - return _validate_config(flat_config) - - -def validate_all_required_env_vars() -> None: - """ - Validate that all required environment variables are set. - - Raises: - ConfigurationError: If any required environment variables are missing - """ - loader = ConfigLoader() - loader.validate_required_env_vars() diff --git a/src/utils/config_manager.py b/src/utils/config_manager.py deleted file mode 100644 index b40d280..0000000 --- a/src/utils/config_manager.py +++ /dev/null @@ -1,219 +0,0 @@ -""" -Centralized configuration manager with validation and credential handling. -""" - -import json -import logging -import os -from pathlib import Path -from typing import Any - -logger = logging.getLogger(__name__) - - -class CredentialManager: - """Handles credential management for Google Cloud services.""" - - def __init__(self): - pass - - - def _validate_required_fields(self, data: dict, required_fields: list[str], context: str) -> None: - """ - Helper function to validate required fields are present in a dictionary. - - Args: - data: Dictionary to validate - required_fields: List of required fields - context: Context for error message - - Raises: - ValueError: If required fields are missing - """ - # Validate that all required fields are present in the data - missing_fields = [field for field in required_fields if field not in data] - - # If any required fields are missing, raise an error - if missing_fields: - raise ValueError(f"{context}: missing {missing_fields}") - - - def _parse_and_validate_credentials_json(self, creds_env: str) -> dict: - """ - Parse and validate Google credentials JSON from environment variable. - - Args: - creds_env: JSON string containing credentials - - Returns: - dict: Parsed and validated credentials data - - Raises: - ValueError: If JSON is invalid or credentials are incomplete - """ - # Try to parse the credentials - try: - # Parse the credentials - creds_data = json.loads(creds_env) - cred_type = creds_data.get("type", "") - - # Validate the credentials data based on the type - if cred_type == "authorized_user": - required_fields = ["client_id", "client_secret", "refresh_token"] - self._validate_required_fields( - creds_data, required_fields, "Incomplete authorized_user credentials" - ) - - elif cred_type == "service_account": - required_fields = ["private_key", "client_email", "project_id"] - self._validate_required_fields( - creds_data, required_fields, "Incomplete service_account credentials" - ) - - else: - raise ValueError( - f"Unsupported credential type: '{cred_type}'. 
Expected 'authorized_user' or 'service_account'" - ) - - # If the credentials parsing fails, raise an error - except Exception as e: - logger.error(f"Failed to parse and validate credentials JSON: {e}") - raise ValueError(f"Invalid credentials JSON: {e}") from e - - # Return the parsed credentials - return creds_data - - - def _setup_user_credentials_in_memory(self, creds_data: dict) -> None: - """Set up user account credentials directly in memory.""" - import google.auth - from google.oauth2.credentials import Credentials - - # Try to set up the credentials - try: - credentials = Credentials( - token=None, - refresh_token=creds_data.get("refresh_token"), - client_id=creds_data.get("client_id"), - client_secret=creds_data.get("client_secret"), - token_uri="https://oauth2.googleapis.com/token", - ) - - # Set credentials globally for GCP libraries - google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] - logger.info("Successfully loaded user account credentials from environment variable") - - # Clear credentials from memory - finally: - if "creds_data" in locals(): - creds_data.clear() - - - def _setup_service_account_credentials_in_memory(self, creds_data: dict) -> None: - """Set up service account credentials directly in memory.""" - import google.auth - from google.oauth2 import service_account - - # Try to set up the credentials - try: - # Create credentials object directly from dict - credentials = service_account.Credentials.from_service_account_info(creds_data) - - # Set credentials globally for GCP libraries - google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] - logger.info("Successfully loaded service account credentials from environment variable") - - # If the credentials creation fails, raise an error - except Exception as e: - logger.error(f"Failed to create service account credentials: {e}") - raise ValueError(f"Invalid service account credentials: {e}") from e - - # Clear the original credentials dict from memory if it exists - finally: - if "creds_data" in locals(): - creds_data.clear() - - - def setup_google_credentials(self) -> None: - """ - Set up Google credentials directly in memory from environment variable. - This function handles multiple credential formats securely: - 1. JSON string in GOOGLE_APPLICATION_CREDENTIALS (inline credentials) - 2. File path in GOOGLE_APPLICATION_CREDENTIALS - 3. Automatic fallback to gcloud CLI authentication - """ - # Get the account credentials from the environment variable - creds_env = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") - - # If the credentials are not set, log a warning and return - if not creds_env: - logger.warning( - "GOOGLE_APPLICATION_CREDENTIALS not set. 
Falling back to gcloud CLI user credentials if available" - ) - return - - # Case 1: JSON credentials provided inline - if creds_env.strip().startswith("{"): - creds_data = None - try: - # Parse and validate credentials - creds_data = self._parse_and_validate_credentials_json(creds_env) - cred_type = creds_data.get("type") - - # Set up credentials based on type - if cred_type == "authorized_user": - self._setup_user_credentials_in_memory(creds_data.copy()) - elif cred_type == "service_account": - self._setup_service_account_credentials_in_memory(creds_data.copy()) - - except Exception as e: - logger.error("Failed to set up credentials from environment variable") - raise ValueError(f"Error processing inline credentials: {e}") from e - finally: - if creds_data is not None: - creds_data.clear() - - # Case 2: File path provided - elif os.path.exists(creds_env): - logger.info(f"Using credentials file: {creds_env}") - - # Case 3: Invalid format - else: - logger.warning( - f"GOOGLE_APPLICATION_CREDENTIALS appears to be neither valid JSON " - f"nor existing file path: {creds_env[:50]}..." - ) - logger.warning("Falling back to gcloud CLI authentication if available") - - - def validate_google_credentials(self) -> bool: - """ - Validate that Google credentials are properly configured and working. - - Returns: - bool: True if credentials are valid and working - """ - # Try to validate the credentials - try: - import google.auth - - credentials, project = google.auth.default() - - # If the credentials are valid, log the success and return True - if credentials: - logger.info(f"Google credentials validated successfully for project: {project}") - return True - - # If the credentials are not valid, log the error and return False - else: - logger.error("No valid Google credentials found") - return False - - # If the credentials could not be validated log the error - except Exception as e: - logger.error(f"Google credentials validation failed: {e}") - return False - - -# Global instances for easy access -credential_manager = CredentialManager() From f3a93e2b56a4be9148cbeeb372ae728cb293f6d1 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 19:41:25 +0200 Subject: [PATCH 35/48] rename functions --- src/utils/configuration.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/utils/configuration.py b/src/utils/configuration.py index af358d7..6a89572 100644 --- a/src/utils/configuration.py +++ b/src/utils/configuration.py @@ -285,8 +285,8 @@ def _parse_and_validate_credentials_json(self, creds_env: str) -> dict: raise ValueError(f"Invalid credentials JSON: {e}") from e - def _setup_user_credentials_in_memory(self, creds_data: dict) -> None: - """Set up user account credentials directly in memory.""" + def _setup_user_credentials_from_dict(self, creds_data: dict) -> None: + """Set up user account credentials directly from a dictionary.""" import google.auth from google.oauth2.credentials import Credentials @@ -310,8 +310,8 @@ def _setup_user_credentials_in_memory(self, creds_data: dict) -> None: creds_data.clear() - def _setup_service_account_credentials_in_memory(self, creds_data: dict) -> None: - """Set up service account credentials directly in memory.""" + def _setup_service_account_credentials_from_dict(self, creds_data: dict) -> None: + """Set up service account credentials directly from a dictionary.""" import google.auth from google.oauth2 import service_account @@ -358,9 +358,9 @@ def setup_google_credentials(self) -> None: # Set up the credentials based 
on the type if creds_data.get("type") == "authorized_user": - self._setup_user_credentials_in_memory(creds_data.copy()) + self._setup_user_credentials_from_dict(creds_data.copy()) else: - self._setup_service_account_credentials_in_memory(creds_data.copy()) + self._setup_service_account_credentials_from_dict(creds_data.copy()) # If the credentials parsing fails, raise an error except Exception as e: From eb64bc150c3ed387ee59416fb05d10d193d7ac40 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 20:11:11 +0200 Subject: [PATCH 36/48] audit cp6 --- src/models/service_quality_oracle.py | 45 ++++++++++++++++++---------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 0e59238..3c6e016 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -12,15 +12,17 @@ import sys import time from datetime import date, timedelta +from pathlib import Path # Add project root to path -project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) -sys.path.insert(0, project_root) +project_root_path = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(project_root_path)) # Import data access utilities with absolute import +from src.models.bigquery_data_access_provider import BigQueryProvider from src.models.blockchain_client import BlockchainClient from src.models.data_processor import DataProcessor -from src.utils.configuration import load_config +from src.utils.configuration import credential_manager, load_config from src.utils.slack_notifier import create_slack_notifier # Set up basic logging @@ -45,7 +47,8 @@ def main(run_date_override: date = None): stage = "Initialization" try: - # Load configuration to get Slack webhook and other settings + # Configuration and credentials + credential_manager.setup_google_credentials() config = load_config() slack_notifier = create_slack_notifier(config.get("SLACK_WEBHOOK_URL")) if slack_notifier: @@ -55,24 +58,36 @@ def main(run_date_override: date = None): # Define the date for the current run current_run_date = run_date_override or date.today() - - # Fetch + save indexer eligibility data and return eligible list - stage = "Data Processing" - data_processor = DataProcessor(config) - eligible_indexers = data_processor.process_and_get_eligible_indexers( - start_date=current_run_date - timedelta(days=28), - end_date=current_run_date, - current_date=current_run_date, + start_date = current_run_date - timedelta(days=config["BIGQUERY_ANALYSIS_PERIOD_DAYS"]) + end_date = current_run_date + + # --- Data Fetching Stage --- + stage = "Data Fetching from BigQuery" + logger.info(f"Fetching data from {start_date} to {end_date}") + bigquery_provider = BigQueryProvider(project=config["BIGQUERY_PROJECT"], location=config["BIGQUERY_LOCATION"]) + eligibility_data = bigquery_provider.fetch_indexer_issuance_eligibility_data(start_date, end_date) + logger.info(f"Successfully fetched data for {len(eligibility_data)} indexers from BigQuery.") + + # --- Data Processing Stage --- + stage = "Data Processing and Artifact Generation" + data_processor = DataProcessor(project_root=project_root_path) + output_date_dir = data_processor.get_date_output_directory(current_run_date) + eligible_indexers, _ = data_processor.export_bigquery_data_as_csvs_and_return_indexer_lists( + input_data_from_bigquery=eligibility_data, + output_date_dir=output_date_dir, ) logger.info(f"Found {len(eligible_indexers)} eligible indexers.") - - 
data_processor.clean_old_date_directories(config["MAX_AGE_BEFORE_DELETION"]) + data_processor.clean_old_date_directories(config["MAX_AGE_BEFORE_DELETION"]) # --- Blockchain Submission Stage --- stage = "Blockchain Submission" logger.info("Instantiating BlockchainClient...") - blockchain_client = BlockchainClient() + blockchain_client = BlockchainClient( + rpc_providers=config["RPC_PROVIDERS"], + contract_address=config["CONTRACT_ADDRESS"], + project_root=project_root_path, + ) transaction_links = blockchain_client.batch_allow_indexers_issuance_eligibility( indexer_addresses=eligible_indexers, private_key=config["PRIVATE_KEY"], From 9327598efa4f21df422f6e0efdfac7bbd893cbd2 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 20:22:07 +0200 Subject: [PATCH 37/48] rename data_processor to eligibility_pipeline --- ...a_processor.py => eligibility_pipeline.py} | 75 ++++++++++++------- src/models/service_quality_oracle.py | 11 ++- 2 files changed, 52 insertions(+), 34 deletions(-) rename src/models/{data_processor.py => eligibility_pipeline.py} (70%) diff --git a/src/models/data_processor.py b/src/models/eligibility_pipeline.py similarity index 70% rename from src/models/data_processor.py rename to src/models/eligibility_pipeline.py index 47c5b13..0530c44 100644 --- a/src/models/data_processor.py +++ b/src/models/eligibility_pipeline.py @@ -1,10 +1,10 @@ """ -Data processing utility module for Service Quality Oracle. +Eligibility pipeline module for the Service Quality Oracle. -This module handles data processing operations including: -- CSV export and file management -- Data cleaning and directory maintenance -- Indexer data filtering and organization +This module contains the logic for processing raw BigQuery data into a list of eligible indexers. It handles: +- Parsing and filtering of indexer performance data. +- Generation of CSV files for record-keeping. +- Cleanup of old data. """ import logging @@ -18,12 +18,12 @@ logger = logging.getLogger(__name__) -class DataProcessor: - """Handles data processing and file management operations.""" +class EligibilityPipeline: + """Handles the data processing pipeline and file management operations.""" def __init__(self, project_root: Path): """ - Initialize the data processor. + Initialize the eligibility pipeline. Args: project_root: Path to project root directory @@ -33,30 +33,59 @@ def __init__(self, project_root: Path): self.output_dir = project_root / "data" / "output" - def export_bigquery_data_as_csvs_and_return_indexer_lists( - self, input_data_from_bigquery: pd.DataFrame, output_date_dir: Path + def process( + self, input_data_from_bigquery: pd.DataFrame, current_date: date ) -> Tuple[List[str], List[str]]: """ - Export BigQuery data as CSVs and return lists of eligible/ineligible indexers. + Process raw BigQuery data to generate data and return eligible indexer lists. Args: - input_data_from_bigquery: Indexer data returned from BigQuery - output_date_dir: Path to date directory for output files + input_data_from_bigquery: DataFrame from BigQuery. + current_date: The date of the current run, used for creating the output directory. Returns: Tuple[List[str], List[str]]: Two lists of indexer addresses, eligible and ineligible """ + # 1. Validate the structure of the input data + required_cols = ["indexer", "eligible_for_indexing_rewards"] + self.validate_dataframe_structure(input_data_from_bigquery, required_cols) + + # 2. 
Generate and save files + output_date_dir = self.get_date_output_directory(current_date) + self._generate_files(input_data_from_bigquery, output_date_dir) + + # 3. Filter and return the lists of indexers + eligible_df = input_data_from_bigquery[ + input_data_from_bigquery["eligible_for_indexing_rewards"] == 1 + ] + ineligible_df = input_data_from_bigquery[ + input_data_from_bigquery["eligible_for_indexing_rewards"] == 0 + ] + + return eligible_df["indexer"].tolist(), ineligible_df["indexer"].tolist() + + def _generate_files(self, data: pd.DataFrame, output_date_dir: Path) -> None: + """ + Save the raw and filtered dataframes to CSV files in a date-specific directory. + - indexer_issuance_eligibility_data.csv (raw data) + - eligible_indexers.csv (only eligible indexer addresses) + - ineligible_indexers.csv (only ineligible indexer addresses) + + Args: + data: The input DataFrame containing all indexer data. + output_date_dir: The directory where artifacts will be saved. + """ # Ensure the output directory exists, creating parent directories if necessary output_date_dir.mkdir(exist_ok=True, parents=True) # Save raw data for internal use raw_data_path = output_date_dir / "indexer_issuance_eligibility_data.csv" - input_data_from_bigquery.to_csv(raw_data_path, index=False) - logger.info(f"Saved raw bigquery results df to: {raw_data_path}") + data.to_csv(raw_data_path, index=False) + logger.info(f"Saved raw BigQuery results to: {raw_data_path}") - # Filter eligible and ineligible indexers - eligible_df = input_data_from_bigquery[input_data_from_bigquery["eligible_for_indexing_rewards"] == 1] - ineligible_df = input_data_from_bigquery[input_data_from_bigquery["eligible_for_indexing_rewards"] == 0] + # Filter and save eligible/ineligible indexer lists + eligible_df = data[data["eligible_for_indexing_rewards"] == 1] + ineligible_df = data[data["eligible_for_indexing_rewards"] == 0] # Save filtered data eligible_path = output_date_dir / "eligible_indexers.csv" @@ -68,9 +97,6 @@ def export_bigquery_data_as_csvs_and_return_indexer_lists( logger.info(f"Saved {len(eligible_df)} eligible indexers to: {eligible_path}") logger.info(f"Saved {len(ineligible_df)} ineligible indexers to: {ineligible_path}") - # Return lists of eligible and ineligible indexers - return eligible_df["indexer"].tolist(), ineligible_df["indexer"].tolist() - def clean_old_date_directories(self, max_age_before_deletion: int) -> None: """ @@ -128,13 +154,6 @@ def get_date_output_directory(self, current_date: date) -> Path: return self.output_dir / current_date.strftime("%Y-%m-%d") - def ensure_output_directory_exists(self) -> None: - """Ensure the main output directory exists.""" - # Create the output directory if it doesn't exist - self.output_dir.mkdir(exist_ok=True, parents=True) - logger.debug(f"Ensured output directory exists: {self.output_dir}") - - def validate_dataframe_structure(self, df: pd.DataFrame, required_columns: List[str]) -> bool: """ Validate that a DataFrame has the required columns. 
diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 3c6e016..77ade8f 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -21,7 +21,7 @@ # Import data access utilities with absolute import from src.models.bigquery_data_access_provider import BigQueryProvider from src.models.blockchain_client import BlockchainClient -from src.models.data_processor import DataProcessor +from src.models.eligibility_pipeline import EligibilityPipeline from src.utils.configuration import credential_manager, load_config from src.utils.slack_notifier import create_slack_notifier @@ -70,15 +70,14 @@ def main(run_date_override: date = None): # --- Data Processing Stage --- stage = "Data Processing and Artifact Generation" - data_processor = DataProcessor(project_root=project_root_path) - output_date_dir = data_processor.get_date_output_directory(current_run_date) - eligible_indexers, _ = data_processor.export_bigquery_data_as_csvs_and_return_indexer_lists( + pipeline = EligibilityPipeline(project_root=project_root_path) + eligible_indexers, _ = pipeline.process( input_data_from_bigquery=eligibility_data, - output_date_dir=output_date_dir, + current_date=current_run_date, ) logger.info(f"Found {len(eligible_indexers)} eligible indexers.") - data_processor.clean_old_date_directories(config["MAX_AGE_BEFORE_DELETION"]) + pipeline.clean_old_date_directories(config["MAX_AGE_BEFORE_DELETION"]) # --- Blockchain Submission Stage --- stage = "Blockchain Submission" From 8e5c8445b65a41288200d3c0e35f27652833a783 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 20:25:58 +0200 Subject: [PATCH 38/48] Minor refactor for efficiency --- src/models/eligibility_pipeline.py | 36 +++++++++++++++++------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/models/eligibility_pipeline.py b/src/models/eligibility_pipeline.py index 0530c44..7bd83a7 100644 --- a/src/models/eligibility_pipeline.py +++ b/src/models/eligibility_pipeline.py @@ -50,21 +50,25 @@ def process( required_cols = ["indexer", "eligible_for_indexing_rewards"] self.validate_dataframe_structure(input_data_from_bigquery, required_cols) - # 2. Generate and save files - output_date_dir = self.get_date_output_directory(current_date) - self._generate_files(input_data_from_bigquery, output_date_dir) - - # 3. Filter and return the lists of indexers + # 2. Filter data into eligible and ineligible groups eligible_df = input_data_from_bigquery[ input_data_from_bigquery["eligible_for_indexing_rewards"] == 1 - ] + ].copy() + ineligible_df = input_data_from_bigquery[ input_data_from_bigquery["eligible_for_indexing_rewards"] == 0 - ] + ].copy() + # 3. Generate and save files + output_date_dir = self.get_date_output_directory(current_date) + self._generate_files(input_data_from_bigquery, eligible_df, ineligible_df, output_date_dir) + + # 4. Return the lists of indexers return eligible_df["indexer"].tolist(), ineligible_df["indexer"].tolist() - def _generate_files(self, data: pd.DataFrame, output_date_dir: Path) -> None: + def _generate_files( + self, raw_data: pd.DataFrame, eligible_df: pd.DataFrame, ineligible_df: pd.DataFrame, output_date_dir: Path + ) -> None: """ Save the raw and filtered dataframes to CSV files in a date-specific directory. 
- indexer_issuance_eligibility_data.csv (raw data) @@ -72,21 +76,19 @@ def _generate_files(self, data: pd.DataFrame, output_date_dir: Path) -> None: - ineligible_indexers.csv (only ineligible indexer addresses) Args: - data: The input DataFrame containing all indexer data. - output_date_dir: The directory where artifacts will be saved. + raw_data: The input DataFrame containing all indexer data. + eligible_df: DataFrame containing only eligible indexers. + ineligible_df: DataFrame containing only ineligible indexers. + output_date_dir: The directory where files will be saved. """ # Ensure the output directory exists, creating parent directories if necessary output_date_dir.mkdir(exist_ok=True, parents=True) # Save raw data for internal use raw_data_path = output_date_dir / "indexer_issuance_eligibility_data.csv" - data.to_csv(raw_data_path, index=False) + raw_data.to_csv(raw_data_path, index=False) logger.info(f"Saved raw BigQuery results to: {raw_data_path}") - # Filter and save eligible/ineligible indexer lists - eligible_df = data[data["eligible_for_indexing_rewards"] == 1] - ineligible_df = data[data["eligible_for_indexing_rewards"] == 0] - # Save filtered data eligible_path = output_date_dir / "eligible_indexers.csv" ineligible_path = output_date_dir / "ineligible_indexers.csv" @@ -173,7 +175,9 @@ def validate_dataframe_structure(self, df: pd.DataFrame, required_columns: List[ # If any required columns are missing, raise an error if missing_columns: - raise ValueError(f"DataFrame missing required columns: {missing_columns}") + raise ValueError( + f"DataFrame missing required columns: {missing_columns}. " f"Found columns: {list(df.columns)}" + ) # If all required columns are present, return True return True From 522c97ac859df05d7337f533c16a33b147844846 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 20:40:23 +0200 Subject: [PATCH 39/48] Update config.toml.example --- config.toml.example | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config.toml.example b/config.toml.example index 8cb5a26..34d9071 100644 --- a/config.toml.example +++ b/config.toml.example @@ -9,6 +9,7 @@ BIGQUERY_LOCATION_ID = "" BIGQUERY_PROJECT_ID = "" BIGQUERY_DATASET_ID = "" +BIGQUERY_ANALYSIS_PERIOD_DAYS = "28" [blockchain] BLOCKCHAIN_CONTRACT_ADDRESS = "" @@ -20,6 +21,8 @@ BLOCKCHAIN_RPC_URLS = [ "", "" ] +BLOCK_EXPLORER_URL = "https://sepolia.arbiscan.io" +TX_TIMEOUT_SECONDS = "30" [scheduling] SCHEDULED_RUN_TIME = "10:00" From 57910baa718038bf9882f4623ff235a302383e1d Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 20:40:52 +0200 Subject: [PATCH 40/48] audit cp7 --- src/models/blockchain_client.py | 124 ++++++++++----------------- src/models/service_quality_oracle.py | 2 + 2 files changed, 47 insertions(+), 79 deletions(-) diff --git a/src/models/blockchain_client.py b/src/models/blockchain_client.py index 76468a2..7ebaa97 100644 --- a/src/models/blockchain_client.py +++ b/src/models/blockchain_client.py @@ -25,7 +25,14 @@ class BlockchainClient: """Handles all blockchain interactions""" - def __init__(self, rpc_providers: List[str], contract_address: str, project_root: Path): + def __init__( + self, + rpc_providers: List[str], + contract_address: str, + project_root: Path, + block_explorer_url: str, + tx_timeout_seconds: int, + ): """ Initialize the blockchain client. 
@@ -33,10 +40,14 @@ def __init__(self, rpc_providers: List[str], contract_address: str, project_root rpc_providers: List of RPC provider URLs contract_address: Smart contract address project_root: Path to project root directory + block_explorer_url: Base URL for the block explorer (e.g., https://sepolia.arbiscan.io) + tx_timeout_seconds: Seconds to wait for a transaction receipt. """ self.rpc_providers = rpc_providers self.contract_address = contract_address self.project_root = project_root + self.block_explorer_url = block_explorer_url.rstrip("/") + self.tx_timeout_seconds = tx_timeout_seconds self.contract_abi = self._load_contract_abi() @@ -103,19 +114,18 @@ def _get_working_web3_connection( raise ConnectionError(f"Failed to connect to any of {len(rpc_providers)} RPC providers: {rpc_providers}") - def _setup_transaction_account(self, private_key: str, w3: Web3) -> str: + def _setup_transaction_account(self, private_key: str) -> str: """ Get the address of the account from the private key. Args: private_key: Private key for the account - w3: Web3 instance Returns: str: Address of the account """ try: - account = w3.eth.account.from_key(private_key) + account = Web3().eth.account.from_key(private_key) logger.info(f"Using account: {account.address}") return account.address @@ -352,60 +362,6 @@ def _send_signed_transaction(self, w3: Web3, signed_tx: Any) -> str: raise - def _build_and_send_transaction( - self, - w3: Web3, - contract_func: Any, - indexer_addresses: List[str], - data_bytes: bytes, - sender_address: str, - private_key: str, - chain_id: int, - gas_limit: int, - nonce: int, - replace: bool, - ) -> str: - """ - Build, sign, and send the transaction. - - Args: - w3: Web3 instance - contract_func: Contract function to call - indexer_addresses: List of indexer addresses - data_bytes: Data bytes for transaction - sender_address: Transaction sender address - private_key: Private key for signing - chain_id: Chain ID - gas_limit: Gas limit for transaction - nonce: Transaction nonce - replace: Whether this is a replacement transaction - - Returns: - str: Transaction hash - """ - try: - # Get gas prices - base_fee, max_priority_fee = self._get_gas_prices(w3, replace) - - # Build transaction parameters - tx_params = self._build_transaction_params( - sender_address, nonce, chain_id, gas_limit, base_fee, max_priority_fee, replace - ) - - # Build and sign transaction - signed_tx = self._build_and_sign_transaction( - w3, contract_func, indexer_addresses, data_bytes, tx_params, private_key - ) - - # Send transaction - return self._send_signed_transaction(w3, signed_tx) - - # If we get an error, log the error and raise an exception - except Exception as e: - logger.error(f"Error in _build_and_send_transaction: {e}") - raise - - def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Dict) -> str: """ Execute the complete transaction process using a single RPC connection. 
@@ -447,27 +403,38 @@ def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Di balance_eth = w3.from_wei(balance_wei, "ether") logger.info(f"Account balance: {balance_eth} ETH") - # All transaction steps with the same RPC connection - gas_limit = self._estimate_transaction_gas( - w3, contract_func, indexer_addresses, data_bytes, sender_address - ) - nonce = self._determine_transaction_nonce(w3, sender_address, replace) - tx_hash = self._build_and_send_transaction( - w3, - contract_func, - indexer_addresses, - data_bytes, - sender_address, - private_key, - chain_id, - gas_limit, - nonce, - replace, - ) + try: + # 1. Estimate gas + gas_limit = self._estimate_transaction_gas( + w3, contract_func, indexer_addresses, data_bytes, sender_address + ) + + # 2. Determine nonce + nonce = self._determine_transaction_nonce(w3, sender_address, replace) + + # 3. Get gas prices + base_fee, max_priority_fee = self._get_gas_prices(w3, replace) + + # 4. Build transaction parameters + tx_params = self._build_transaction_params( + sender_address, nonce, chain_id, gas_limit, base_fee, max_priority_fee, replace + ) + + # 5. Build and sign transaction + signed_tx = self._build_and_sign_transaction( + w3, contract_func, indexer_addresses, data_bytes, tx_params, private_key + ) + + # 6. Send transaction + tx_hash = self._send_signed_transaction(w3, signed_tx) + + except Exception as e: + logger.error(f"Transaction execution failed: {e}", exc_info=True) + raise # Wait for receipt with the same connection try: - tx_receipt = w3.eth.wait_for_transaction_receipt(tx_hash, timeout=30) + tx_receipt = w3.eth.wait_for_transaction_receipt(tx_hash, timeout=self.tx_timeout_seconds) if tx_receipt["status"] == 1: logger.info( f"Transaction confirmed in block {tx_receipt['blockNumber']}, " @@ -550,8 +517,7 @@ def send_transaction_to_allow_indexers( str: Transaction hash """ # Set up account - temp_w3 = Web3() - sender_address = self._setup_transaction_account(private_key, temp_w3) + sender_address = self._setup_transaction_account(private_key) # Convert addresses to checksum format checksum_addresses = [Web3.to_checksum_address(addr) for addr in indexer_addresses] @@ -639,7 +605,7 @@ def batch_allow_indexers_issuance_eligibility( replace, data_bytes, ) - tx_links.append(f"https://sepolia.arbiscan.io/tx/{tx_hash}") + tx_links.append(f"{self.block_explorer_url}/tx/{tx_hash}") logger.info(f"Batch {i+1} transaction successful: {tx_hash}") # If we get an error, log the error and raise an exception diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 77ade8f..e04898e 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -86,6 +86,8 @@ def main(run_date_override: date = None): rpc_providers=config["RPC_PROVIDERS"], contract_address=config["CONTRACT_ADDRESS"], project_root=project_root_path, + block_explorer_url=config["BLOCK_EXPLORER_URL"], + tx_timeout_seconds=config["TX_TIMEOUT_SECONDS"], ) transaction_links = blockchain_client.batch_allow_indexers_issuance_eligibility( indexer_addresses=eligible_indexers, From 48ebc2c2007a537feecd81f8c2c50f072f7c08c9 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 20:48:10 +0200 Subject: [PATCH 41/48] audit cp8 --- config.toml.example | 3 ++- src/models/service_quality_oracle.py | 2 +- src/utils/configuration.py | 9 ++++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/config.toml.example b/config.toml.example index 34d9071..79fb6ef 100644 --- 
a/config.toml.example +++ b/config.toml.example @@ -9,7 +9,7 @@ BIGQUERY_LOCATION_ID = "" BIGQUERY_PROJECT_ID = "" BIGQUERY_DATASET_ID = "" -BIGQUERY_ANALYSIS_PERIOD_DAYS = "28" + [blockchain] BLOCKCHAIN_CONTRACT_ADDRESS = "" @@ -34,6 +34,7 @@ SUBGRAPH_URL_PRODUCTION = "" [processing] BATCH_SIZE = 125 MAX_AGE_BEFORE_DELETION = 120 +BIGQUERY_ANALYSIS_PERIOD_DAYS = "28" # ============================================================================= # SENSITIVE CONFIGURATION diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index e04898e..561453f 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -64,7 +64,7 @@ def main(run_date_override: date = None): # --- Data Fetching Stage --- stage = "Data Fetching from BigQuery" logger.info(f"Fetching data from {start_date} to {end_date}") - bigquery_provider = BigQueryProvider(project=config["BIGQUERY_PROJECT"], location=config["BIGQUERY_LOCATION"]) + bigquery_provider = BigQueryProvider(project=config["BIGQUERY_PROJECT_ID"], location=config["BIGQUERY_LOCATION"]) eligibility_data = bigquery_provider.fetch_indexer_issuance_eligibility_data(start_date, end_date) logger.info(f"Successfully fetched data for {len(eligibility_data)} indexers from BigQuery.") diff --git a/src/utils/configuration.py b/src/utils/configuration.py index 6a89572..592c9c8 100644 --- a/src/utils/configuration.py +++ b/src/utils/configuration.py @@ -132,8 +132,10 @@ def get_flat_config(self) -> dict[str, Any]: # Blockchain settings "CONTRACT_ADDRESS": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), "CONTRACT_FUNCTION": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), - "CHAIN_ID": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID"), + "CHAIN_ID": int(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID")), "RPC_PROVIDERS": self._parse_rpc_urls(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), + "BLOCK_EXPLORER_URL": substituted_config.get("blockchain", {}).get("BLOCK_EXPLORER_URL", "https://sepolia.arbiscan.io"), + "TX_TIMEOUT_SECONDS": int(substituted_config.get("blockchain", {}).get("TX_TIMEOUT_SECONDS", 30)), # Scheduling "SCHEDULED_RUN_TIME": substituted_config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), @@ -142,8 +144,9 @@ def get_flat_config(self) -> dict[str, Any]: "SUBGRAPH_URL": substituted_config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), # Processing settings - "BATCH_SIZE": substituted_config.get("processing", {}).get("BATCH_SIZE", 125), - "MAX_AGE_BEFORE_DELETION": substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120), + "BATCH_SIZE": int(substituted_config.get("processing", {}).get("BATCH_SIZE", 125)), + "MAX_AGE_BEFORE_DELETION": int(substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120)), + "BIGQUERY_ANALYSIS_PERIOD_DAYS": int(substituted_config.get("processing", {}).get("BIGQUERY_ANALYSIS_PERIOD_DAYS", 28)), # Secrets "GOOGLE_APPLICATION_CREDENTIALS": substituted_config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), From 4cbb60ff9e3c4c84c1d99f72cbeca1617fafa6d0 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 21:01:49 +0200 Subject: [PATCH 42/48] audit cp9 --- config.toml.example | 8 +++- src/models/bigquery_data_access_provider.py | 43 +++++++++++---------- src/models/service_quality_oracle.py | 14 ++++++- src/utils/configuration.py | 7 ++++ 4 files changed, 50 insertions(+), 22 deletions(-) 
diff --git a/config.toml.example b/config.toml.example index 79fb6ef..0a4a505 100644 --- a/config.toml.example +++ b/config.toml.example @@ -9,7 +9,7 @@ BIGQUERY_LOCATION_ID = "" BIGQUERY_PROJECT_ID = "" BIGQUERY_DATASET_ID = "" - +BIGQUERY_TABLE_ID = "" [blockchain] BLOCKCHAIN_CONTRACT_ADDRESS = "" @@ -36,6 +36,12 @@ BATCH_SIZE = 125 MAX_AGE_BEFORE_DELETION = 120 BIGQUERY_ANALYSIS_PERIOD_DAYS = "28" +[eligibility_criteria] +MIN_ONLINE_DAYS = "5" +MIN_SUBGRAPHS = "10" +MAX_LATENCY_MS = "5000" +MAX_BLOCKS_BEHIND = "50000" + # ============================================================================= # SENSITIVE CONFIGURATION # ============================================================================= diff --git a/src/models/bigquery_data_access_provider.py b/src/models/bigquery_data_access_provider.py index 23d6d56..e16f86b 100644 --- a/src/models/bigquery_data_access_provider.py +++ b/src/models/bigquery_data_access_provider.py @@ -20,11 +20,25 @@ class BigQueryProvider: """A class that provides read access to Google BigQuery for indexer data.""" - def __init__(self, project: str, location: str) -> None: + def __init__( + self, + project: str, + location: str, + table_name: str, + min_online_days: int, + min_subgraphs: int, + max_latency_ms: int, + max_blocks_behind: int, + ) -> None: # Configure BigQuery connection globally for all SQL queries to BigQuery bpd.options.bigquery.location = location bpd.options.bigquery.project = project bpd.options.display.progress_bar = None + self.table_name = table_name + self.min_online_days = min_online_days + self.min_subgraphs = min_subgraphs + self.max_latency_ms = max_latency_ms + self.max_blocks_behind = max_blocks_behind @retry_with_backoff(max_attempts=10, min_wait=1, max_wait=60, exceptions=(ConnectionError, socket.timeout)) @@ -39,17 +53,6 @@ def _read_gbq_dataframe(self, query: str) -> DataFrame: GOOGLE_APPLICATION_CREDENTIALS environment variable if set. This variable should point to the JSON file containing the service account key. 
""" - # Check if GOOGLE_APPLICATION_CREDENTIALS is set and valid - creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") - if creds_path: - if not os.path.exists(os.path.expanduser(creds_path)): - logger.warning(f"GOOGLE_APPLICATION_CREDENTIALS path not found: {creds_path}") - logger.warning("Falling back to gcloud CLI user credentials.") - else: - logger.info("Using environment variable $GOOGLE_APPLICATION_CREDENTIALS for authentication.") - else: - logger.warning("GOOGLE_APPLICATION_CREDENTIALS not set, falling back to gcloud CLI user credentials") - # Execute the query with retry logic return cast(DataFrame, bpd.read_gbq(query).to_pandas()) @@ -85,14 +88,14 @@ def _get_indexer_eligibility_query(self, start_date: date, end_date: date) -> st COUNT(*) AS query_attempts, SUM(CASE WHEN status = '200 OK' - AND response_time_ms < 5000 - AND blocks_behind < 50000 + AND response_time_ms < {self.max_latency_ms} + AND blocks_behind < {self.max_blocks_behind} THEN 1 ELSE 0 END) AS good_responses, COUNT(DISTINCT deployment) AS unique_subgraphs_served FROM - internal_metrics.metrics_indexer_attempts + {self.table_name} WHERE day_partition BETWEEN '{start_date_str}' AND '{end_date_str}' GROUP BY @@ -104,7 +107,7 @@ def _get_indexer_eligibility_query(self, start_date: date, end_date: date) -> st indexer, day, unique_subgraphs_served, - CASE WHEN good_responses >= 1 AND unique_subgraphs_served >= 10 + CASE WHEN good_responses >= 1 AND unique_subgraphs_served >= {self.min_subgraphs} THEN 1 ELSE 0 END AS is_online_day FROM @@ -116,12 +119,12 @@ def _get_indexer_eligibility_query(self, start_date: date, end_date: date) -> st indexer, COUNT(DISTINCT deployment) AS unique_good_response_subgraphs FROM - internal_metrics.metrics_indexer_attempts + {self.table_name} WHERE day_partition BETWEEN '{start_date_str}' AND '{end_date_str}' AND status = '200 OK' - AND response_time_ms < 5000 - AND blocks_behind < 50000 + AND response_time_ms < {self.max_latency_ms} + AND blocks_behind < {self.max_blocks_behind} GROUP BY indexer ), @@ -150,7 +153,7 @@ def _get_indexer_eligibility_query(self, start_date: date, end_date: date) -> st total_good_days_online, unique_good_response_subgraphs, CASE - WHEN total_good_days_online >= 5 THEN 1 + WHEN total_good_days_online >= {self.min_online_days} THEN 1 ELSE 0 END AS eligible_for_indexing_rewards FROM diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 561453f..47c629e 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -64,7 +64,19 @@ def main(run_date_override: date = None): # --- Data Fetching Stage --- stage = "Data Fetching from BigQuery" logger.info(f"Fetching data from {start_date} to {end_date}") - bigquery_provider = BigQueryProvider(project=config["BIGQUERY_PROJECT_ID"], location=config["BIGQUERY_LOCATION"]) + + # Construct the full table name from configuration + table_name = f"{config['BIGQUERY_PROJECT_ID']}.{config['BIGQUERY_DATASET_ID']}.{config['BIGQUERY_TABLE_ID']}" + + bigquery_provider = BigQueryProvider( + project=config["BIGQUERY_PROJECT_ID"], + location=config["BIGQUERY_LOCATION"], + table_name=table_name, + min_online_days=config["MIN_ONLINE_DAYS"], + min_subgraphs=config["MIN_SUBGRAPHS"], + max_latency_ms=config["MAX_LATENCY_MS"], + max_blocks_behind=config["MAX_BLOCKS_BEHIND"], + ) eligibility_data = bigquery_provider.fetch_indexer_issuance_eligibility_data(start_date, end_date) logger.info(f"Successfully fetched data for {len(eligibility_data)} indexers from 
BigQuery.") diff --git a/src/utils/configuration.py b/src/utils/configuration.py index 592c9c8..21588d3 100644 --- a/src/utils/configuration.py +++ b/src/utils/configuration.py @@ -128,6 +128,13 @@ def get_flat_config(self) -> dict[str, Any]: "BIGQUERY_LOCATION": substituted_config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), "BIGQUERY_PROJECT_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), "BIGQUERY_DATASET_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), + "BIGQUERY_TABLE_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_TABLE_ID", "metrics_indexer_attempts"), + + # Eligibility Criteria + "MIN_ONLINE_DAYS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_ONLINE_DAYS", 5)), + "MIN_SUBGRAPHS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_SUBGRAPHS", 10)), + "MAX_LATENCY_MS": int(substituted_config.get("eligibility_criteria", {}).get("MAX_LATENCY_MS", 5000)), + "MAX_BLOCKS_BEHIND": int(substituted_config.get("eligibility_criteria", {}).get("MAX_BLOCKS_BEHIND", 50000)), # Blockchain settings "CONTRACT_ADDRESS": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), From ac343635ed040cc29df2e7bbb1ac4c1fe9feedc7 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 21:05:57 +0200 Subject: [PATCH 43/48] Update configuration.py --- src/utils/configuration.py | 72 ++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/src/utils/configuration.py b/src/utils/configuration.py index 21588d3..57fcdbc 100644 --- a/src/utils/configuration.py +++ b/src/utils/configuration.py @@ -125,24 +125,24 @@ def get_flat_config(self) -> dict[str, Any]: # Convert nested structure to flat format return { # BigQuery settings - "BIGQUERY_LOCATION": substituted_config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID", "US"), - "BIGQUERY_PROJECT_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID", "graph-mainnet"), - "BIGQUERY_DATASET_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_DATASET_ID", "internal_metrics"), - "BIGQUERY_TABLE_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_TABLE_ID", "metrics_indexer_attempts"), + "BIGQUERY_LOCATION": substituted_config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID"), + "BIGQUERY_PROJECT_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID"), + "BIGQUERY_DATASET_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_DATASET_ID"), + "BIGQUERY_TABLE_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_TABLE_ID"), # Eligibility Criteria - "MIN_ONLINE_DAYS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_ONLINE_DAYS", 5)), - "MIN_SUBGRAPHS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_SUBGRAPHS", 10)), - "MAX_LATENCY_MS": int(substituted_config.get("eligibility_criteria", {}).get("MAX_LATENCY_MS", 5000)), - "MAX_BLOCKS_BEHIND": int(substituted_config.get("eligibility_criteria", {}).get("MAX_BLOCKS_BEHIND", 50000)), + "MIN_ONLINE_DAYS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_ONLINE_DAYS")), + "MIN_SUBGRAPHS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_SUBGRAPHS")), + "MAX_LATENCY_MS": int(substituted_config.get("eligibility_criteria", {}).get("MAX_LATENCY_MS")), + "MAX_BLOCKS_BEHIND": int(substituted_config.get("eligibility_criteria", {}).get("MAX_BLOCKS_BEHIND")), # Blockchain settings "CONTRACT_ADDRESS": 
substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), "CONTRACT_FUNCTION": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), "CHAIN_ID": int(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID")), - "RPC_PROVIDERS": self._parse_rpc_urls(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS", [])), - "BLOCK_EXPLORER_URL": substituted_config.get("blockchain", {}).get("BLOCK_EXPLORER_URL", "https://sepolia.arbiscan.io"), - "TX_TIMEOUT_SECONDS": int(substituted_config.get("blockchain", {}).get("TX_TIMEOUT_SECONDS", 30)), + "RPC_PROVIDERS": self._parse_rpc_urls(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS")), + "BLOCK_EXPLORER_URL": substituted_config.get("blockchain", {}).get("BLOCK_EXPLORER_URL"), + "TX_TIMEOUT_SECONDS": int(substituted_config.get("blockchain", {}).get("TX_TIMEOUT_SECONDS")), # Scheduling "SCHEDULED_RUN_TIME": substituted_config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), @@ -151,9 +151,9 @@ def get_flat_config(self) -> dict[str, Any]: "SUBGRAPH_URL": substituted_config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), # Processing settings - "BATCH_SIZE": int(substituted_config.get("processing", {}).get("BATCH_SIZE", 125)), - "MAX_AGE_BEFORE_DELETION": int(substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION", 120)), - "BIGQUERY_ANALYSIS_PERIOD_DAYS": int(substituted_config.get("processing", {}).get("BIGQUERY_ANALYSIS_PERIOD_DAYS", 28)), + "BATCH_SIZE": int(substituted_config.get("processing", {}).get("BATCH_SIZE")), + "MAX_AGE_BEFORE_DELETION": int(substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION")), + "BIGQUERY_ANALYSIS_PERIOD_DAYS": int(substituted_config.get("processing", {}).get("BIGQUERY_ANALYSIS_PERIOD_DAYS")), # Secrets "GOOGLE_APPLICATION_CREDENTIALS": substituted_config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), @@ -164,15 +164,14 @@ def get_flat_config(self) -> dict[str, Any]: # fmt: on - def _parse_rpc_urls(self, rpc_urls: list) -> list[str]: + def _parse_rpc_urls(self, rpc_urls: Optional[list]) -> list[str]: """Parse RPC URLs from list format.""" if not rpc_urls or not isinstance(rpc_urls, list) or not all(isinstance(url, str) for url in rpc_urls): - raise ConfigurationError("BLOCKCHAIN_RPC_URLS must be a list of valid string providers") + return [] valid_providers = [url.strip() for url in rpc_urls if url.strip()] if not valid_providers: - - raise ConfigurationError("No valid RPC providers found in BLOCKCHAIN_RPC_URLS") + return [] return valid_providers @@ -213,23 +212,28 @@ def get_missing_env_vars(self) -> list[str]: def _validate_config(config: dict[str, Any]) -> dict[str, Any]: - if config.get("CHAIN_ID"): - try: - config["CHAIN_ID"] = int(config["CHAIN_ID"]) - except (ValueError, TypeError) as e: - raise ConfigurationError(f"Invalid CHAIN_ID: {config['CHAIN_ID']} - must be an integer.") from e - - if config.get("SCHEDULED_RUN_TIME"): - try: - datetime.strptime(config["SCHEDULED_RUN_TIME"], "%H:%M") - except (ValueError, TypeError) as e: - raise ConfigurationError(f"Invalid SCHEDULED_RUN_TIME: {config['SCHEDULED_RUN_TIME']} - must be HH:MM.") from e - - required = ["PRIVATE_KEY", "CONTRACT_ADDRESS", "CONTRACT_FUNCTION", "CHAIN_ID", "SCHEDULED_RUN_TIME"] - missing = [field for field in required if not config.get(field)] + # Define required fields. All other fields from `get_flat_config` are considered optional. 
+ required = [ + "BIGQUERY_LOCATION", "BIGQUERY_PROJECT_ID", "BIGQUERY_DATASET_ID", "BIGQUERY_TABLE_ID", + "MIN_ONLINE_DAYS", "MIN_SUBGRAPHS", "MAX_LATENCY_MS", "MAX_BLOCKS_BEHIND", + "CONTRACT_ADDRESS", "CONTRACT_FUNCTION", "CHAIN_ID", "RPC_PROVIDERS", + "BLOCK_EXPLORER_URL", "TX_TIMEOUT_SECONDS", "SCHEDULED_RUN_TIME", + "BATCH_SIZE", "MAX_AGE_BEFORE_DELETION", "BIGQUERY_ANALYSIS_PERIOD_DAYS", + "PRIVATE_KEY", + ] + missing = [field for field in required if config.get(field) is None or config.get(field) == []] if missing: - raise ConfigurationError(f"Missing required configuration fields: {', '.join(missing)}") - + raise ConfigurationError( + f"Missing required configuration fields in config.toml or environment variables: {', '.join(sorted(missing))}" + ) + + # Validate specific field formats + try: + # The int() casts in get_flat_config will handle type errors for numeric fields. + datetime.strptime(config["SCHEDULED_RUN_TIME"], "%H:%M") + except (ValueError, TypeError): + raise ConfigurationError(f"Invalid SCHEDULED_RUN_TIME: {config['SCHEDULED_RUN_TIME']} - must be in HH:MM format.") + return config From fbd02f8410e003ab8f2e4240d4839fc7ee76faf7 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 21:11:52 +0200 Subject: [PATCH 44/48] simplify logic --- src/utils/key_validator.py | 56 +++++--------------------------------- 1 file changed, 7 insertions(+), 49 deletions(-) diff --git a/src/utils/key_validator.py b/src/utils/key_validator.py index 69a1e04..a5a097b 100644 --- a/src/utils/key_validator.py +++ b/src/utils/key_validator.py @@ -6,8 +6,6 @@ import logging import re -from dataclasses import dataclass -from typing import Optional logger = logging.getLogger(__name__) @@ -18,34 +16,22 @@ class KeyValidationError(Exception): pass -@dataclass -class KeyValidationResult: - """Result of private key validation.""" - - is_valid: bool - formatted_key: Optional[str] - error_message: Optional[str] - - -def validate_private_key(private_key: str) -> KeyValidationResult: +def validate_and_format_private_key(private_key: str) -> str: """ - Validate and format a private key. + Validate and format a private key, raising an exception if invalid. + Ensures the key is a 64-character hex string and adds the '0x' prefix. Args: private_key: Raw private key string Returns: - KeyValidationResult object with validation status, formatted key, and error message + Formatted private key string Raises: KeyValidationError: If key validation fails """ if not private_key or not isinstance(private_key, str): - return KeyValidationResult( - is_valid=False, - formatted_key=None, - error_message="Private key must be a non-empty string", - ) + raise KeyValidationError("Private key must be a non-empty string") # Remove whitespace and common prefixes clean_key = private_key.strip() @@ -58,35 +44,7 @@ def validate_private_key(private_key: str) -> KeyValidationResult: # Validate hex format (64 characters) if not re.match(r"^[0-9a-fA-F]{64}$", hex_key): - return KeyValidationResult( - is_valid=False, - formatted_key=None, - error_message="Private key must be 64 hex characters", - ) + raise KeyValidationError("Private key must be 64 hex characters") # Return formatted key with 0x prefix - formatted_key = f"0x{hex_key.lower()}" - return KeyValidationResult( - is_valid=True, - formatted_key=formatted_key, - error_message=None, - ) - - -def validate_and_format_private_key(private_key: str) -> str: - """ - Validate and format a private key, raising an exception if invalid. 
- - Args: - private_key: Raw private key string - - Returns: - Formatted private key string - - Raises: - KeyValidationError: If key validation fails - """ - result = validate_private_key(private_key) - if not result.is_valid: - raise KeyValidationError(f"Invalid private key: {result.error_message}") - return result.formatted_key + return f"0x{hex_key.lower()}" From 23a1bdd41d00613f57d9e3d13a77e2316ce00784 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 21:13:07 +0200 Subject: [PATCH 45/48] Ruff --- src/models/bigquery_data_access_provider.py | 1 - src/models/eligibility_pipeline.py | 5 +- src/models/scheduler.py | 12 +++- src/models/service_quality_oracle.py | 9 +-- src/utils/configuration.py | 72 +++++++++++++-------- 5 files changed, 63 insertions(+), 36 deletions(-) diff --git a/src/models/bigquery_data_access_provider.py b/src/models/bigquery_data_access_provider.py index e16f86b..390950c 100644 --- a/src/models/bigquery_data_access_provider.py +++ b/src/models/bigquery_data_access_provider.py @@ -3,7 +3,6 @@ """ import logging -import os import socket from datetime import date from typing import cast diff --git a/src/models/eligibility_pipeline.py b/src/models/eligibility_pipeline.py index 7bd83a7..4aa87b1 100644 --- a/src/models/eligibility_pipeline.py +++ b/src/models/eligibility_pipeline.py @@ -33,9 +33,7 @@ def __init__(self, project_root: Path): self.output_dir = project_root / "data" / "output" - def process( - self, input_data_from_bigquery: pd.DataFrame, current_date: date - ) -> Tuple[List[str], List[str]]: + def process(self, input_data_from_bigquery: pd.DataFrame, current_date: date) -> Tuple[List[str], List[str]]: """ Process raw BigQuery data to generate data and return eligible indexer lists. @@ -66,6 +64,7 @@ def process( # 4. Return the lists of indexers return eligible_df["indexer"].tolist(), ineligible_df["indexer"].tolist() + def _generate_files( self, raw_data: pd.DataFrame, eligible_df: pd.DataFrame, ineligible_df: pd.DataFrame, output_date_dir: Path ) -> None: diff --git a/src/models/scheduler.py b/src/models/scheduler.py index 0c962b7..55440aa 100644 --- a/src/models/scheduler.py +++ b/src/models/scheduler.py @@ -26,10 +26,12 @@ class Scheduler: + def __init__(self): self.slack_notifier = None self.config = self.initialize() + def get_last_run_date(self): """ Get the date of the last successful run from a persistent file. @@ -57,6 +59,7 @@ def get_last_run_date(self): return last_run_date + def save_last_run_date(self, run_date): """Save the date of the last successful run to a file that we continuously overwrite each time""" try: @@ -66,6 +69,7 @@ def save_last_run_date(self, run_date): except Exception as e: logger.error(f"Error saving last run date: {e}") + def update_healthcheck(self, message=None): """Update the healthcheck file with current timestamp and optional message""" try: @@ -77,6 +81,7 @@ def update_healthcheck(self, message=None): except Exception as e: logger.warning(f"Failed to update healthcheck file: {e}") + @retry( stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=60, max=600), @@ -105,10 +110,13 @@ def run_oracle(self, run_date_override=None): self.save_last_run_date(run_date) end_time = datetime.now() duration_in_seconds = (end_time - start_time).total_seconds() - success_message = f"Scheduler successfully triggered oracle run for {run_date}. Duration: {duration_in_seconds:.2f}s" + success_message = ( + f"Scheduler successfully triggered oracle run for {run_date}. 
Duration: {duration_in_seconds:.2f}s" + ) logger.info(success_message) self.update_healthcheck(success_message) + def check_missed_runs(self): """Check if we missed any runs and execute them if needed""" today = datetime.now().date() @@ -138,6 +146,7 @@ def check_missed_runs(self): # The run_oracle method is decorated with @retry, so it will handle its own retries. self.run_oracle(run_date_override=yesterday) + def initialize(self): """Initialize the scheduler and validate configuration""" logger.info("Initializing scheduler...") @@ -187,6 +196,7 @@ def initialize(self): ) sys.exit(1) + def run(self): """Main loop for the scheduler""" logger.info("Scheduler started and waiting for scheduled runs") diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 47c629e..7c00a59 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -8,7 +8,6 @@ """ import logging -import os import sys import time from datetime import date, timedelta @@ -64,10 +63,12 @@ def main(run_date_override: date = None): # --- Data Fetching Stage --- stage = "Data Fetching from BigQuery" logger.info(f"Fetching data from {start_date} to {end_date}") - + # Construct the full table name from configuration - table_name = f"{config['BIGQUERY_PROJECT_ID']}.{config['BIGQUERY_DATASET_ID']}.{config['BIGQUERY_TABLE_ID']}" - + table_name = ( + f"{config['BIGQUERY_PROJECT_ID']}.{config['BIGQUERY_DATASET_ID']}.{config['BIGQUERY_TABLE_ID']}" + ) + bigquery_provider = BigQueryProvider( project=config["BIGQUERY_PROJECT_ID"], location=config["BIGQUERY_LOCATION"], diff --git a/src/utils/configuration.py b/src/utils/configuration.py index 57fcdbc..1aa062e 100644 --- a/src/utils/configuration.py +++ b/src/utils/configuration.py @@ -22,11 +22,13 @@ class ConfigurationError(Exception): """Raised when configuration loading or validation fails.""" + pass # --- Configuration Loading --- + class ConfigLoader: """Internal class to load configuration from TOML and environment variables.""" @@ -42,7 +44,7 @@ def _get_default_config_path(self) -> str: docker_path = Path("/app/config.toml") if docker_path.exists(): return str(docker_path) - + # For local development, look in project root current_path = Path(__file__).parent while current_path != current_path.parent: @@ -50,7 +52,7 @@ def _get_default_config_path(self) -> str: if config_path.exists(): return str(config_path) current_path = current_path.parent - + raise ConfigurationError("Could not find config.toml in project root or Docker container") @@ -73,23 +75,22 @@ def _substitute_env_vars(self, config_toml: Any) -> Any: # Find all environment variable references env_vars = self._env_var_pattern.findall(config_toml) - for env_var in env_vars: env_value = os.getenv(env_var) if env_value is None: raise ConfigurationError(f"Required environment variable {env_var} is not set") - + # Replace the environment variable reference with actual value config_toml = config_toml.replace(f"${env_var}", env_value) return config_toml - + elif isinstance(config_toml, dict): return {k: self._substitute_env_vars(v) for k, v in config_toml.items()} - + elif isinstance(config_toml, list): return [self._substitute_env_vars(item) for item in config_toml] - + return config_toml @@ -120,7 +121,7 @@ def get_flat_config(self) -> dict[str, Any]: """ raw_config = self._get_raw_config() substituted_config = self._substitute_env_vars(raw_config) - + # fmt: off # Convert nested structure to flat format return { @@ -129,13 +130,13 @@ def 
get_flat_config(self) -> dict[str, Any]: "BIGQUERY_PROJECT_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID"), "BIGQUERY_DATASET_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_DATASET_ID"), "BIGQUERY_TABLE_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_TABLE_ID"), - + # Eligibility Criteria "MIN_ONLINE_DAYS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_ONLINE_DAYS")), "MIN_SUBGRAPHS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_SUBGRAPHS")), "MAX_LATENCY_MS": int(substituted_config.get("eligibility_criteria", {}).get("MAX_LATENCY_MS")), "MAX_BLOCKS_BEHIND": int(substituted_config.get("eligibility_criteria", {}).get("MAX_BLOCKS_BEHIND")), - + # Blockchain settings "CONTRACT_ADDRESS": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), "CONTRACT_FUNCTION": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), @@ -143,18 +144,18 @@ def get_flat_config(self) -> dict[str, Any]: "RPC_PROVIDERS": self._parse_rpc_urls(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS")), "BLOCK_EXPLORER_URL": substituted_config.get("blockchain", {}).get("BLOCK_EXPLORER_URL"), "TX_TIMEOUT_SECONDS": int(substituted_config.get("blockchain", {}).get("TX_TIMEOUT_SECONDS")), - + # Scheduling "SCHEDULED_RUN_TIME": substituted_config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), - + # Subgraph URLs "SUBGRAPH_URL": substituted_config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), - + # Processing settings "BATCH_SIZE": int(substituted_config.get("processing", {}).get("BATCH_SIZE")), "MAX_AGE_BEFORE_DELETION": int(substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION")), "BIGQUERY_ANALYSIS_PERIOD_DAYS": int(substituted_config.get("processing", {}).get("BIGQUERY_ANALYSIS_PERIOD_DAYS")), - + # Secrets "GOOGLE_APPLICATION_CREDENTIALS": substituted_config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), "PRIVATE_KEY": substituted_config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), @@ -214,11 +215,24 @@ def get_missing_env_vars(self) -> list[str]: def _validate_config(config: dict[str, Any]) -> dict[str, Any]: # Define required fields. All other fields from `get_flat_config` are considered optional. required = [ - "BIGQUERY_LOCATION", "BIGQUERY_PROJECT_ID", "BIGQUERY_DATASET_ID", "BIGQUERY_TABLE_ID", - "MIN_ONLINE_DAYS", "MIN_SUBGRAPHS", "MAX_LATENCY_MS", "MAX_BLOCKS_BEHIND", - "CONTRACT_ADDRESS", "CONTRACT_FUNCTION", "CHAIN_ID", "RPC_PROVIDERS", - "BLOCK_EXPLORER_URL", "TX_TIMEOUT_SECONDS", "SCHEDULED_RUN_TIME", - "BATCH_SIZE", "MAX_AGE_BEFORE_DELETION", "BIGQUERY_ANALYSIS_PERIOD_DAYS", + "BIGQUERY_LOCATION", + "BIGQUERY_PROJECT_ID", + "BIGQUERY_DATASET_ID", + "BIGQUERY_TABLE_ID", + "MIN_ONLINE_DAYS", + "MIN_SUBGRAPHS", + "MAX_LATENCY_MS", + "MAX_BLOCKS_BEHIND", + "CONTRACT_ADDRESS", + "CONTRACT_FUNCTION", + "CHAIN_ID", + "RPC_PROVIDERS", + "BLOCK_EXPLORER_URL", + "TX_TIMEOUT_SECONDS", + "SCHEDULED_RUN_TIME", + "BATCH_SIZE", + "MAX_AGE_BEFORE_DELETION", + "BIGQUERY_ANALYSIS_PERIOD_DAYS", "PRIVATE_KEY", ] missing = [field for field in required if config.get(field) is None or config.get(field) == []] @@ -232,7 +246,9 @@ def _validate_config(config: dict[str, Any]) -> dict[str, Any]: # The int() casts in get_flat_config will handle type errors for numeric fields. 
datetime.strptime(config["SCHEDULED_RUN_TIME"], "%H:%M") except (ValueError, TypeError): - raise ConfigurationError(f"Invalid SCHEDULED_RUN_TIME: {config['SCHEDULED_RUN_TIME']} - must be in HH:MM format.") + raise ConfigurationError( + f"Invalid SCHEDULED_RUN_TIME: {config['SCHEDULED_RUN_TIME']} - must be in HH:MM format." + ) return config @@ -256,6 +272,7 @@ def validate_all_required_env_vars() -> None: # --- Credential Management --- + class CredentialManager: """Handles credential management for Google Cloud services.""" @@ -303,7 +320,7 @@ def _setup_user_credentials_from_dict(self, creds_data: dict) -> None: """Set up user account credentials directly from a dictionary.""" import google.auth from google.oauth2.credentials import Credentials - + # Try to set up the credentials try: credentials = Credentials( @@ -315,7 +332,7 @@ def _setup_user_credentials_from_dict(self, creds_data: dict) -> None: ) # Set credentials globally for GCP libraries - google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] + google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] logger.info("Successfully loaded user account credentials from environment variable") # Clear credentials from memory @@ -337,7 +354,7 @@ def _setup_service_account_credentials_from_dict(self, creds_data: dict) -> None # Set credentials globally for GCP libraries google.auth._default._CREDENTIALS = credentials logger.info("Successfully loaded service account credentials from environment variable") - + # If the credentials creation fails, raise an error except Exception as e: raise ValueError(f"Invalid service account credentials: {e}") from e @@ -379,16 +396,17 @@ def setup_google_credentials(self) -> None: # If the credentials parsing fails, raise an error except Exception as e: raise ValueError(f"Error processing inline credentials: {e}") from e - + # Clear the credentials from memory finally: if creds_data: creds_data.clear() - + # Case 2: File path provided elif not os.path.exists(creds_env): - logger.warning(f"GOOGLE_APPLICATION_CREDENTIALS is not valid JSON or a file path.") + logger.warning("GOOGLE_APPLICATION_CREDENTIALS is not valid JSON or a file path.") logger.warning("Falling back to gcloud CLI authentication if available.") + # Global instance for easy access -credential_manager = CredentialManager() \ No newline at end of file +credential_manager = CredentialManager() From b862dd5aaee70832d591400938d60df53aa00942 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 21:22:34 +0200 Subject: [PATCH 46/48] Update README.md --- README.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3b8963d..bbf5fb6 100644 --- a/README.md +++ b/README.md @@ -47,12 +47,19 @@ Please refer to the [ELIGIBILITY_CRITERIA.md](./ELIGIBILITY_CRITERIA.md) file to ## Data Flow -The application follows this data flow: +The application follows a clear data flow, managed by a daily scheduler: -1. **BigQuery Data Acquisition**: The `bigquery_fetch_and_save_indexer_issuance_eligibility_data_finally_return_eligible_indexers` function in `issuance_data_access_helper.py` fetches fresh data from BigQuery, processes it to determine eligibility, and returns the eligibility data list that would then be posted on chain. - - This function also ensures that data is saved to local files in dated directories for auditing/historical reference over the data retention period. +1. **Scheduler (`scheduler.py`)**: This is the main entry point. 
It runs on a schedule (e.g., daily), manages the application lifecycle, and triggers the oracle run. It is also responsible for catching up on any missed runs. -2. **Blockchain Publication**: The eligible indexers list from step 1 is directly posted on-chain to a smart contract. Batching of transactions is performed if necessary. +2. **Orchestrator (`service_quality_oracle.py`)**: For each run, this module orchestrates the end-to-end process by coordinating the other components. + +3. **Data Fetching (`bigquery_data_access_provider.py`)**: The orchestrator calls this provider to execute a configurable SQL query against Google BigQuery, fetching the raw indexer performance data. + +4. **Data Processing (`eligibility_pipeline.py`)**: The raw data is passed to this module, which processes it, filters for eligible and ineligible indexers, and generates CSV artifacts for auditing and record-keeping. + +5. **Blockchain Submission (`blockchain_client.py`)**: The orchestrator takes the final list of eligible indexers and passes it to this client, which handles the complexities of batching, signing, and sending the transaction to the blockchain via RPC providers with built-in failover. + +6. **Notifications (`slack_notifier.py`)**: Throughout the process, status updates (success, failure, warnings) are sent to Slack. ## CI/CD Pipeline From 4968619ed34c2850a4141bcddbd6f868fc1453f9 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Tue, 10 Jun 2025 21:29:29 +0200 Subject: [PATCH 47/48] Fix CI/CD errors, update workflow, create tests placeholders ruff Update configuration.py Fix CI error --- .github/workflows/ci.yml | 54 ++++++++++++++++++++----- .github/workflows/tests.yml | 3 +- Dockerfile | 12 +++--- config.toml.example | 8 ++-- src/models/blockchain_client.py | 36 +++++++++++------ src/models/service_quality_oracle.py | 10 ++--- src/utils/configuration.py | 59 ++++++++++++++++++---------- src/utils/slack_notifier.py | 2 +- tests/placeholder.py | 0 tests/test_blockchain_client.py | 8 ++++ tests/test_configuration.py | 8 ++++ tests/test_eligibility_pipeline.py | 8 ++++ tests/test_scheduler.py | 7 ++++ tests/test_service_quality_oracle.py | 7 ++++ 14 files changed, 163 insertions(+), 59 deletions(-) delete mode 100644 tests/placeholder.py create mode 100644 tests/test_blockchain_client.py create mode 100644 tests/test_configuration.py create mode 100644 tests/test_eligibility_pipeline.py create mode 100644 tests/test_scheduler.py create mode 100644 tests/test_service_quality_oracle.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2850e3..932e54a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -66,7 +66,7 @@ jobs: cd src python -c " import sys; sys.path.insert(0, '..') - from src.utils.config_loader import load_config + from src.utils.configuration import load_config from src.utils.key_validator import validate_and_format_private_key print('Core modules import successfully') " @@ -74,14 +74,50 @@ jobs: - name: Validate configuration run: | python -c " - import tomli - with open('config.toml.example', 'rb') as f: - config = tomli.load(f) - required = ['bigquery', 'blockchain', 'scheduling', 'secrets'] - for section in required: - if section not in config: - raise ValueError(f'Missing section: {section}') - print('Configuration valid') + import sys + import os + + # Add project root to path + sys.path.insert(0, '.') + + os.environ['BLOCKCHAIN_PRIVATE_KEY'] = '0x' + 'f' * 64 + os.environ['SLACK_WEBHOOK_URL'] = 
'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX' + os.environ['STUDIO_API_KEY'] = 'api-key' + os.environ['STUDIO_DEPLOY_KEY'] = 'deploy-key' + os.environ['ARBITRUM_API_KEY'] = 'api-key' + os.environ['ETHERSCAN_API_KEY'] = 'api-key' + os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '{}' + + from src.utils.configuration import ConfigLoader, _validate_config + + print('Validating config.toml.example...') + + # Use the example file and run the full validation logic from our application + loader = ConfigLoader(config_path='config.toml.example') + config = loader.get_flat_config() + + print('Patching config in-memory with dummy data for validation...') + config_to_validate = config.copy() + config_to_validate.update({ + 'BIGQUERY_LOCATION_ID': 'dummy-location', + 'BIGQUERY_PROJECT_ID': 'dummy-project', + 'BIGQUERY_DATASET_ID': 'dummy-dataset', + 'BIGQUERY_TABLE_ID': 'dummy-table', + 'BLOCKCHAIN_CONTRACT_ADDRESS': '0x' + '0' * 40, + 'BLOCKCHAIN_FUNCTION_NAME': 'dummyFunction', + 'BLOCKCHAIN_CHAIN_ID': 1, + 'BLOCKCHAIN_RPC_URLS': ['http://dummy-rpc.com'], + 'SUBGRAPH_URL_PRE_PRODUCTION': 'http://dummy-subgraph.com', + 'SUBGRAPH_URL_PRODUCTION': 'http://dummy-subgraph.com', + 'SCHEDULED_RUN_TIME': '00:00', + 'BATCH_SIZE': 100, + 'MAX_AGE_BEFORE_DELETION': 100, + 'BIGQUERY_ANALYSIS_PERIOD_DAYS': 100, + }) + + _validate_config(config_to_validate) + + print('config.toml.example is structurally valid.') " # ============================================================================= diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 693174c..acda954 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,7 +34,8 @@ jobs: run: | if [ -d "tests" ] && [ "$(find tests -name "test_*.py" -o -name "*_test.py" | wc -l)" -gt 0 ]; then echo "Running tests" - pytest tests/ -v --cov=src --cov-report=term-missing -p no:ethereum + # Run pytest and allow exit code 5 (no tests found), but fail on any other error + pytest tests/ -v --cov=src --cov-report=term-missing -p no:ethereum || ([ $? -eq 5 ] && echo "Pytest exited with 5 (No tests found), which is expected. Passing." || exit $?) else echo "No tests found. Test directory is empty or doesn't contain test files." echo "Tests will be skipped until test files are added." diff --git a/Dockerfile b/Dockerfile index 669d65a..a7b1126 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,13 +11,11 @@ LABEL description="Service Quality Oracle" \ WORKDIR /app -""" -Setup enviroment variables: - 1. PYTHONDONTWRITEBYTECODE=1 - Prevent python from creating .pyc files - 2. PYTHONUNBUFFERED=1 - Send logs direct to console without buffering - 3. PYTHONPATH=/app - Add app directory to python import path - 4. TZ=UTC - Set timezone to UTC -""" +# Setup enviroment variables: +# 1. PYTHONDONTWRITEBYTECODE=1 - Prevent python from creating .pyc files +# 2. PYTHONUNBUFFERED=1 - Send logs direct to console without buffering +# 3. PYTHONPATH=/app - Add app directory to python import path +# 4. 
TZ=UTC - Set timezone to UTC ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ PYTHONPATH=/app \ diff --git a/config.toml.example b/config.toml.example index 0a4a505..770a48f 100644 --- a/config.toml.example +++ b/config.toml.example @@ -6,10 +6,10 @@ # ============================================================================= [bigquery] -BIGQUERY_LOCATION_ID = "" -BIGQUERY_PROJECT_ID = "" -BIGQUERY_DATASET_ID = "" -BIGQUERY_TABLE_ID = "" +BIGQUERY_LOCATION_ID = "US" +BIGQUERY_PROJECT_ID = "graph-mainnet" +BIGQUERY_DATASET_ID = "internal_metrics" +BIGQUERY_TABLE_ID = "metrics_indexer_attempts" [blockchain] BLOCKCHAIN_CONTRACT_ADDRESS = "" diff --git a/src/models/blockchain_client.py b/src/models/blockchain_client.py index 7ebaa97..9e8cf78 100644 --- a/src/models/blockchain_client.py +++ b/src/models/blockchain_client.py @@ -11,10 +11,12 @@ import json import logging from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple +from typing import Any, Callable, Dict, List, Tuple, cast from web3 import Web3 from web3.contract import Contract +from web3.providers import HTTPProvider +from web3.types import BlockData, ChecksumAddress, HexStr from src.utils.key_validator import KeyValidationError, validate_and_format_private_key from src.utils.retry_decorator import retry_with_backoff @@ -136,7 +138,12 @@ def _setup_transaction_account(self, private_key: str) -> str: def _estimate_transaction_gas( - self, w3: Web3, contract_func: Any, indexer_addresses: List[str], data_bytes: bytes, sender_address: str + self, + w3: Web3, + contract_func: Any, + indexer_addresses: List[str], + data_bytes: bytes, + sender_address: ChecksumAddress, ) -> int: """ Estimate gas for the transaction with 25% buffer. @@ -164,7 +171,7 @@ def _estimate_transaction_gas( raise - def _determine_transaction_nonce(self, w3: Web3, sender_address: str, replace: bool) -> int: + def _determine_transaction_nonce(self, w3: Web3, sender_address: ChecksumAddress, replace: bool) -> int: """ Determine the appropriate nonce for the transaction. 
@@ -187,9 +194,12 @@ def _determine_transaction_nonce(self, w3: Web3, sender_address: str, replace: b # Try to find pending transactions try: - pending_txs = w3.eth.get_block("pending", full_transactions=True) + pending_txs_data = w3.eth.get_block("pending", full_transactions=True) + pending_txs = cast(BlockData, pending_txs_data) sender_pending_txs = [ - tx for tx in pending_txs.transactions if hasattr(tx, "from") and tx["from"] == sender_address + tx + for tx in pending_txs["transactions"] + if isinstance(tx, dict) and tx.get("from") == sender_address ] # If we found pending transactions, use the nonce of the first pending transaction @@ -225,8 +235,10 @@ def _get_gas_prices(self, w3: Web3, replace: bool) -> Tuple[int, int]: """Get base fee and max priority fee for transaction.""" # Get current gas prices with detailed logging try: - latest_block = w3.eth.get_block("latest") - base_fee = latest_block["baseFeePerGas"] + latest_block_data = w3.eth.get_block("latest") + latest_block = cast(BlockData, latest_block_data) + base_fee_hex = latest_block["baseFeePerGas"] + base_fee = int(base_fee_hex) logger.info(f"Latest block base fee: {base_fee/1e9:.2f} gwei") # If the base fee cannot be retrieved, use a fallback value @@ -250,7 +262,7 @@ def _get_gas_prices(self, w3: Web3, replace: bool) -> Tuple[int, int]: def _build_transaction_params( self, - sender_address: str, + sender_address: ChecksumAddress, nonce: int, chain_id: int, gas_limit: int, @@ -379,9 +391,10 @@ def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Di contract_function = params["contract_function"] indexer_addresses = params["indexer_addresses"] data_bytes = params["data_bytes"] - sender_address = params["sender_address"] + sender_address_str = params["sender_address"] chain_id = params["chain_id"] replace = params["replace"] + sender_address = Web3.to_checksum_address(sender_address_str) # Validate contract function exists if not hasattr(contract.functions, contract_function): @@ -396,7 +409,8 @@ def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Di logger.info(f"Data bytes length: {len(data_bytes)}") logger.info(f"Chain ID: {chain_id}") logger.info(f"Sender address: {sender_address}") - logger.info(f"Using RPC: {w3.provider.endpoint_uri}") + if isinstance(w3.provider, HTTPProvider): + logger.info(f"Using RPC: {w3.provider.endpoint_uri}") # Check account balance balance_wei = w3.eth.get_balance(sender_address) @@ -434,7 +448,7 @@ def _execute_complete_transaction(self, w3: Web3, contract: Contract, params: Di # Wait for receipt with the same connection try: - tx_receipt = w3.eth.wait_for_transaction_receipt(tx_hash, timeout=self.tx_timeout_seconds) + tx_receipt = w3.eth.wait_for_transaction_receipt(HexStr(tx_hash), timeout=self.tx_timeout_seconds) if tx_receipt["status"] == 1: logger.info( f"Transaction confirmed in block {tx_receipt['blockNumber']}, " diff --git a/src/models/service_quality_oracle.py b/src/models/service_quality_oracle.py index 7c00a59..51573f8 100644 --- a/src/models/service_quality_oracle.py +++ b/src/models/service_quality_oracle.py @@ -71,7 +71,7 @@ def main(run_date_override: date = None): bigquery_provider = BigQueryProvider( project=config["BIGQUERY_PROJECT_ID"], - location=config["BIGQUERY_LOCATION"], + location=config["BIGQUERY_LOCATION_ID"], table_name=table_name, min_online_days=config["MIN_ONLINE_DAYS"], min_subgraphs=config["MIN_SUBGRAPHS"], @@ -96,8 +96,8 @@ def main(run_date_override: date = None): stage = "Blockchain Submission" 
logger.info("Instantiating BlockchainClient...") blockchain_client = BlockchainClient( - rpc_providers=config["RPC_PROVIDERS"], - contract_address=config["CONTRACT_ADDRESS"], + rpc_providers=config["BLOCKCHAIN_RPC_URLS"], + contract_address=config["BLOCKCHAIN_CONTRACT_ADDRESS"], project_root=project_root_path, block_explorer_url=config["BLOCK_EXPLORER_URL"], tx_timeout_seconds=config["TX_TIMEOUT_SECONDS"], @@ -105,8 +105,8 @@ def main(run_date_override: date = None): transaction_links = blockchain_client.batch_allow_indexers_issuance_eligibility( indexer_addresses=eligible_indexers, private_key=config["PRIVATE_KEY"], - chain_id=config["CHAIN_ID"], - contract_function=config["CONTRACT_FUNCTION"], + chain_id=config["BLOCKCHAIN_CHAIN_ID"], + contract_function=config["BLOCKCHAIN_FUNCTION_NAME"], batch_size=config["BATCH_SIZE"], replace=True, ) diff --git a/src/utils/configuration.py b/src/utils/configuration.py index 1aa062e..2a1ad07 100644 --- a/src/utils/configuration.py +++ b/src/utils/configuration.py @@ -122,45 +122,55 @@ def get_flat_config(self) -> dict[str, Any]: raw_config = self._get_raw_config() substituted_config = self._substitute_env_vars(raw_config) + # Helper to safely convert values to integers + + + def to_int(v): + return int(v) if v is not None and v != "" else None + # fmt: off # Convert nested structure to flat format return { # BigQuery settings - "BIGQUERY_LOCATION": substituted_config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID"), + "BIGQUERY_LOCATION_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_LOCATION_ID"), "BIGQUERY_PROJECT_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_PROJECT_ID"), "BIGQUERY_DATASET_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_DATASET_ID"), "BIGQUERY_TABLE_ID": substituted_config.get("bigquery", {}).get("BIGQUERY_TABLE_ID"), # Eligibility Criteria - "MIN_ONLINE_DAYS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_ONLINE_DAYS")), - "MIN_SUBGRAPHS": int(substituted_config.get("eligibility_criteria", {}).get("MIN_SUBGRAPHS")), - "MAX_LATENCY_MS": int(substituted_config.get("eligibility_criteria", {}).get("MAX_LATENCY_MS")), - "MAX_BLOCKS_BEHIND": int(substituted_config.get("eligibility_criteria", {}).get("MAX_BLOCKS_BEHIND")), + "MIN_ONLINE_DAYS": to_int(substituted_config.get("eligibility_criteria", {}).get("MIN_ONLINE_DAYS")), + "MIN_SUBGRAPHS": to_int(substituted_config.get("eligibility_criteria", {}).get("MIN_SUBGRAPHS")), + "MAX_LATENCY_MS": to_int(substituted_config.get("eligibility_criteria", {}).get("MAX_LATENCY_MS")), + "MAX_BLOCKS_BEHIND": to_int(substituted_config.get("eligibility_criteria", {}).get("MAX_BLOCKS_BEHIND")), # Blockchain settings - "CONTRACT_ADDRESS": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), - "CONTRACT_FUNCTION": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), - "CHAIN_ID": int(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID")), - "RPC_PROVIDERS": self._parse_rpc_urls(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS")), + "BLOCKCHAIN_CONTRACT_ADDRESS": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CONTRACT_ADDRESS"), + "BLOCKCHAIN_FUNCTION_NAME": substituted_config.get("blockchain", {}).get("BLOCKCHAIN_FUNCTION_NAME"), + "BLOCKCHAIN_CHAIN_ID": to_int(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_CHAIN_ID")), + "BLOCKCHAIN_RPC_URLS": self._parse_rpc_urls(substituted_config.get("blockchain", {}).get("BLOCKCHAIN_RPC_URLS")), "BLOCK_EXPLORER_URL": 
substituted_config.get("blockchain", {}).get("BLOCK_EXPLORER_URL"), - "TX_TIMEOUT_SECONDS": int(substituted_config.get("blockchain", {}).get("TX_TIMEOUT_SECONDS")), + "TX_TIMEOUT_SECONDS": to_int(substituted_config.get("blockchain", {}).get("TX_TIMEOUT_SECONDS")), # Scheduling "SCHEDULED_RUN_TIME": substituted_config.get("scheduling", {}).get("SCHEDULED_RUN_TIME"), # Subgraph URLs - "SUBGRAPH_URL": substituted_config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), + "SUBGRAPH_URL_PRE_PRODUCTION": substituted_config.get("subgraph", {}).get("SUBGRAPH_URL_PRE_PRODUCTION"), + "SUBGRAPH_URL_PRODUCTION": substituted_config.get("subgraph", {}).get("SUBGRAPH_URL_PRODUCTION"), # Processing settings - "BATCH_SIZE": int(substituted_config.get("processing", {}).get("BATCH_SIZE")), - "MAX_AGE_BEFORE_DELETION": int(substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION")), - "BIGQUERY_ANALYSIS_PERIOD_DAYS": int(substituted_config.get("processing", {}).get("BIGQUERY_ANALYSIS_PERIOD_DAYS")), + "BATCH_SIZE": to_int(substituted_config.get("processing", {}).get("BATCH_SIZE")), + "MAX_AGE_BEFORE_DELETION": to_int(substituted_config.get("processing", {}).get("MAX_AGE_BEFORE_DELETION")), + "BIGQUERY_ANALYSIS_PERIOD_DAYS": to_int(substituted_config.get("processing", {}).get("BIGQUERY_ANALYSIS_PERIOD_DAYS")), # Secrets "GOOGLE_APPLICATION_CREDENTIALS": substituted_config.get("secrets", {}).get("GOOGLE_APPLICATION_CREDENTIALS"), "PRIVATE_KEY": substituted_config.get("secrets", {}).get("BLOCKCHAIN_PRIVATE_KEY"), "STUDIO_API_KEY": substituted_config.get("secrets", {}).get("STUDIO_API_KEY"), + "STUDIO_DEPLOY_KEY": substituted_config.get("secrets", {}).get("STUDIO_DEPLOY_KEY"), "SLACK_WEBHOOK_URL": substituted_config.get("secrets", {}).get("SLACK_WEBHOOK_URL"), + "ETHERSCAN_API_KEY": substituted_config.get("secrets", {}).get("ETHERSCAN_API_KEY"), + "ARBITRUM_API_KEY": substituted_config.get("secrets", {}).get("ARBITRUM_API_KEY"), } # fmt: on @@ -215,7 +225,7 @@ def get_missing_env_vars(self) -> list[str]: def _validate_config(config: dict[str, Any]) -> dict[str, Any]: # Define required fields. All other fields from `get_flat_config` are considered optional. 
required = [ - "BIGQUERY_LOCATION", + "BIGQUERY_LOCATION_ID", "BIGQUERY_PROJECT_ID", "BIGQUERY_DATASET_ID", "BIGQUERY_TABLE_ID", @@ -223,19 +233,26 @@ def _validate_config(config: dict[str, Any]) -> dict[str, Any]: "MIN_SUBGRAPHS", "MAX_LATENCY_MS", "MAX_BLOCKS_BEHIND", - "CONTRACT_ADDRESS", - "CONTRACT_FUNCTION", - "CHAIN_ID", - "RPC_PROVIDERS", + "BLOCKCHAIN_CONTRACT_ADDRESS", + "BLOCKCHAIN_FUNCTION_NAME", + "BLOCKCHAIN_CHAIN_ID", + "BLOCKCHAIN_RPC_URLS", "BLOCK_EXPLORER_URL", "TX_TIMEOUT_SECONDS", "SCHEDULED_RUN_TIME", + "SUBGRAPH_URL_PRE_PRODUCTION", + "SUBGRAPH_URL_PRODUCTION", "BATCH_SIZE", "MAX_AGE_BEFORE_DELETION", "BIGQUERY_ANALYSIS_PERIOD_DAYS", "PRIVATE_KEY", + "STUDIO_API_KEY", + "STUDIO_DEPLOY_KEY", + "SLACK_WEBHOOK_URL", + "ETHERSCAN_API_KEY", + "ARBITRUM_API_KEY", ] - missing = [field for field in required if config.get(field) is None or config.get(field) == []] + missing = [field for field in required if not config.get(field)] if missing: raise ConfigurationError( f"Missing required configuration fields in config.toml or environment variables: {', '.join(sorted(missing))}" @@ -352,7 +369,7 @@ def _setup_service_account_credentials_from_dict(self, creds_data: dict) -> None credentials = service_account.Credentials.from_service_account_info(creds_data) # Set credentials globally for GCP libraries - google.auth._default._CREDENTIALS = credentials + google.auth._default._CREDENTIALS = credentials # type: ignore[attr-defined] logger.info("Successfully loaded service account credentials from environment variable") # If the credentials creation fails, raise an error diff --git a/src/utils/slack_notifier.py b/src/utils/slack_notifier.py index ff0ef9a..20921b1 100644 --- a/src/utils/slack_notifier.py +++ b/src/utils/slack_notifier.py @@ -7,7 +7,7 @@ from datetime import datetime from typing import Dict, List, Optional -import requests +import requests # type: ignore[import-untyped] from src.utils.retry_decorator import retry_with_backoff diff --git a/tests/placeholder.py b/tests/placeholder.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_blockchain_client.py b/tests/test_blockchain_client.py new file mode 100644 index 0000000..d5c021e --- /dev/null +++ b/tests/test_blockchain_client.py @@ -0,0 +1,8 @@ +""" +Unit tests for the BlockchainClient. +""" + +# TODO: Add tests for RPC failover +# TODO: Add tests for gas estimation +# TODO: Add tests for nonce management +# TODO: Add tests for transaction batching diff --git a/tests/test_configuration.py b/tests/test_configuration.py new file mode 100644 index 0000000..275cc26 --- /dev/null +++ b/tests/test_configuration.py @@ -0,0 +1,8 @@ +""" +Unit tests for the configuration loader and validator. +""" + +# TODO: Add test for successful config loading +# TODO: Add test for missing required config value (should raise ConfigurationError) +# TODO: Add test for invalid config value (e.g., bad time format) +# TODO: Add test for environment variable substitution diff --git a/tests/test_eligibility_pipeline.py b/tests/test_eligibility_pipeline.py new file mode 100644 index 0000000..ccaab92 --- /dev/null +++ b/tests/test_eligibility_pipeline.py @@ -0,0 +1,8 @@ +""" +Unit tests for the EligibilityPipeline. 
+""" + +# TODO: Add test for data processing logic +# TODO: Add test for CSV artifact generation +# TODO: Add test for date directory cleanup +# TODO: Add test for dataframe validation diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py new file mode 100644 index 0000000..826138a --- /dev/null +++ b/tests/test_scheduler.py @@ -0,0 +1,7 @@ +""" +Unit tests for the Scheduler. +""" + +# TODO: Add test for scheduled job execution +# TODO: Add test for missed run detection and execution +# TODO: Add test for healthcheck file updates diff --git a/tests/test_service_quality_oracle.py b/tests/test_service_quality_oracle.py new file mode 100644 index 0000000..287bc84 --- /dev/null +++ b/tests/test_service_quality_oracle.py @@ -0,0 +1,7 @@ +""" +Unit tests for the main ServiceQualityOracle orchestrator. +""" + +# TODO: Add end-to-end test with mocked dependencies +# TODO: Add test for successful run notification +# TODO: Add test for failure run notification From 617a008fa57db5d6fc7ebaf21f4b33e009239a40 Mon Sep 17 00:00:00 2001 From: MoonBoi9001 Date: Wed, 11 Jun 2025 13:28:33 +0200 Subject: [PATCH 48/48] move key validation into _setup_transaction_account --- src/models/blockchain_client.py | 92 ++++++++++++++++----------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/src/models/blockchain_client.py b/src/models/blockchain_client.py index 9e8cf78..8b0e435 100644 --- a/src/models/blockchain_client.py +++ b/src/models/blockchain_client.py @@ -116,22 +116,29 @@ def _get_working_web3_connection( raise ConnectionError(f"Failed to connect to any of {len(rpc_providers)} RPC providers: {rpc_providers}") - def _setup_transaction_account(self, private_key: str) -> str: + def _setup_transaction_account(self, private_key: str) -> Tuple[str, str]: """ - Get the address of the account from the private key. + Validate the private key and return the formatted key and account address. Args: - private_key: Private key for the account + private_key: The private key string. Returns: - str: Address of the account + A tuple containing the account address and the formatted private key. + + Raises: + KeyValidationError: If the private key is invalid. 
""" try: - account = Web3().eth.account.from_key(private_key) + formatted_key = validate_and_format_private_key(private_key) + account = Web3().eth.account.from_key(formatted_key) logger.info(f"Using account: {account.address}") - return account.address + return account.address, formatted_key + + except KeyValidationError as e: + logger.error(f"Invalid private key provided: {e}") + raise - # If the account cannot be retrieved, log the error and raise an exception except Exception as e: logger.error(f"Failed to retrieve account from private key: {str(e)}") raise @@ -530,15 +537,15 @@ def send_transaction_to_allow_indexers( Returns: str: Transaction hash """ - # Set up account - sender_address = self._setup_transaction_account(private_key) + # Set up account and validate private key + sender_address, formatted_private_key = self._setup_transaction_account(private_key) # Convert addresses to checksum format checksum_addresses = [Web3.to_checksum_address(addr) for addr in indexer_addresses] # Prepare all parameters for the transaction transaction_params = { - "private_key": private_key, + "private_key": formatted_private_key, "contract_function": contract_function, "indexer_addresses": checksum_addresses, "data_bytes": data_bytes, @@ -596,43 +603,36 @@ def batch_allow_indexers_issuance_eligibility( num_batches = (total_indexers + batch_size - 1) // batch_size logger.info(f"Processing {total_indexers} indexers in {num_batches} batch(es) of {batch_size}") - try: - tx_links = [] - # Validate and format private key - validated_private_key = validate_and_format_private_key(private_key) - - # Process each batch - for i in range(num_batches): - start_idx = i * batch_size - end_idx = min(start_idx + batch_size, total_indexers) - batch_indexers = indexer_addresses[start_idx:end_idx] - - logger.info(f"Processing batch {i+1}/{num_batches} with {len(batch_indexers)} indexers") - - # Try to send the transaction to the network (uses RPC failover) - try: - tx_hash = self.send_transaction_to_allow_indexers( - batch_indexers, - validated_private_key, - chain_id, - contract_function, - replace, - data_bytes, - ) - tx_links.append(f"{self.block_explorer_url}/tx/{tx_hash}") - logger.info(f"Batch {i+1} transaction successful: {tx_hash}") + tx_links = [] - # If we get an error, log the error and raise an exception - except Exception as e: - logger.error(f"Error processing batch {i+1} due to: {e}") - raise + # Process each batch + for i in range(num_batches): + start_idx = i * batch_size + end_idx = min(start_idx + batch_size, total_indexers) + batch_indexers = indexer_addresses[start_idx:end_idx] - # Log all transaction links - for i, tx_link in enumerate(tx_links, 1): - logger.info(f"Transaction link {i} of {len(tx_links)}: {tx_link}") + logger.info(f"Processing batch {i+1}/{num_batches} with {len(batch_indexers)} indexers") - return tx_links + # Try to send the transaction to the network (uses RPC failover) + try: + tx_hash = self.send_transaction_to_allow_indexers( + batch_indexers, + private_key, + chain_id, + contract_function, + replace, + data_bytes, + ) + tx_links.append(f"{self.block_explorer_url}/tx/{tx_hash}") + logger.info(f"Batch {i+1} transaction successful: {tx_hash}") - except KeyValidationError as e: - logger.error(f"Private key validation failed: {e}") - raise ValueError(f"Invalid private key: {e}") from e + # If we get an error, log the error and raise an exception + except Exception as e: + logger.error(f"Error processing batch {i+1} due to: {e}") + raise + + # Log all transaction links + for i, 
tx_link in enumerate(tx_links, 1): + logger.info(f"Transaction link {i} of {len(tx_links)}: {tx_link}") + + return tx_links
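Editor's note: a minimal standalone sketch of the batching arithmetic that the final hunk above relies on (ceiling division into batches of at most batch_size, one transaction per batch). The chunk_addresses helper and the dummy addresses below are hypothetical, for illustration only — they are not part of the repository, which performs the slicing inline inside batch_allow_indexers_issuance_eligibility and submits each batch via send_transaction_to_allow_indexers.

from typing import List


def chunk_addresses(addresses: List[str], batch_size: int) -> List[List[str]]:
    """Split a list of indexer addresses into consecutive batches of at most batch_size."""
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")
    # Ceiling division, mirroring: num_batches = (total_indexers + batch_size - 1) // batch_size
    num_batches = (len(addresses) + batch_size - 1) // batch_size
    return [addresses[i * batch_size:(i + 1) * batch_size] for i in range(num_batches)]


if __name__ == "__main__":
    # Seven placeholder addresses split into batches of three -> sizes 3, 3, 1
    dummy = [f"0x{i:040x}" for i in range(7)]
    for n, batch in enumerate(chunk_addresses(dummy, batch_size=3), 1):
        print(f"batch {n}: {len(batch)} addresses")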