diff --git a/grafana/alerts-rustchain.yml b/grafana/alerts-rustchain.yml
new file mode 100644
index 00000000..f3933ef6
--- /dev/null
+++ b/grafana/alerts-rustchain.yml
@@ -0,0 +1,83 @@
+groups:
+  - name: rustchain_alerts
+    interval: 30s
+    rules:
+      # Alert: Node is down (scrape failing; the exporter only ever sets rustchain_node_up to 1, so test `up`)
+      - alert: NodeDown
+        expr: up{job="rustchain-node"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "RustChain node is down"
+          description: "RustChain node has been down for more than 1 minute"
+
+      # Alert: Epoch stuck (no progress for 15 minutes = 1.5 epochs)
+      - alert: EpochStuck
+        expr: changes(rustchain_epoch_current[15m]) == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Epoch progress stuck"
+          description: "Epoch number has not advanced in the last 15 minutes"
+
+      # Alert: Database growing too fast (gauge, so use delta() over the hour rather than rate())
+      - alert: DatabaseGrowth
+        expr: delta(rustchain_db_size_bytes[1h]) > 1073741824  # > 1 GiB/hour
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Database growing too fast"
+          description: "Database is growing faster than 1GB/hour"
+
+      # Alert: No active miners
+      - alert: NoActiveMiners
+        expr: rustchain_miners_active == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "No active miners"
+          description: "No miners have attested in the last 2 epochs"
+
+      # Alert: Low enrolled miners
+      - alert: LowEnrolledMiners
+        expr: rustchain_epoch_enrolled_miners < 5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low miner enrollment"
+          description: "Less than 5 miners enrolled in current epoch"
+
+      # Alert: High API latency
+      - alert: HighAPILatency
+        expr: histogram_quantile(0.95, rate(rustchain_api_request_duration_seconds_bucket[5m])) > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High API latency"
+          description: "API p95 latency is above 5 seconds"
+
+      # Alert: Backup too old
+      - alert: BackupTooOld
+        expr: rustchain_backup_age_hours > 24
+        for: 1h
+        labels:
+          severity: warning
+        annotations:
+          summary: "Backup is too old"
+          description: "Last backup is more than 24 hours old"
+
+      # Alert: Node restarted recently
+      - alert: NodeRestarted
+        expr: rustchain_node_uptime_seconds < 300
+        for: 1m
+        labels:
+          severity: info
+        annotations:
+          summary: "Node recently restarted"
+          description: "Node has been up for less than 5 minutes"
diff --git a/grafana/dashboard-rustchain.json b/grafana/dashboard-rustchain.json
new file mode 100644
index 00000000..86fcf478
--- /dev/null
+++ b/grafana/dashboard-rustchain.json
@@ -0,0 +1,191 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "-- Grafana --",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": null,
+  "links": [],
+  "panels": [
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "thresholds"},
+          "mappings": [{"options": {"0": {"color": "red", "index": 0, "text": "DOWN"}, "1": {"color": "green", "index": 1, "text": "UP"}}, "type": "value"}],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "green", "value": 1}]}
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
+      "id": 1,
+      "options": {"orientation": "auto", "reduceOptions": {"values": false, "calcs": ["lastNotNull"], "fields": ""}, "showThresholdLabels": false, "showThresholdMarkers": true},
+      "pluginVersion": "8.0.0",
+      "targets": [{"expr": "up{job=\"rustchain-node\"}", "refId": "A"}],
+      "title": "Node Status",
+      "type": "gauge"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "palette-classic"},
+          "custom": {"axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": true},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 6, "y": 0},
+      "id": 2,
+      "options": {"legend": {"calcs": [], "displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "single"}},
+      "pluginVersion": "8.0.0",
+      "targets": [{"expr": "rustchain_node_uptime_seconds", "legendFormat": "Uptime", "refId": "A"}],
+      "title": "Node Uptime",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "thresholds"},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
+      "id": 3,
+      "options": {"colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"values": false, "calcs": ["lastNotNull"], "fields": ""}, "text": {}, "textMode": "auto"},
+      "pluginVersion": "8.0.0",
+      "targets": [{"expr": "rustchain_epoch_current", "refId": "A"}],
+      "title": "Current Epoch",
+      "type": "stat"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "thresholds"},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "red", "value": null}, {"color": "yellow", "value": 5}, {"color": "green", "value": 10}]},
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 4, "w": 6, "x": 18, "y": 4},
+      "id": 4,
+      "options": {"colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": {"values": false, "calcs": ["lastNotNull"], "fields": ""}, "text": {}, "textMode": "auto"},
+      "pluginVersion": "8.0.0",
+      "targets": [{"expr": "rustchain_epoch_enrolled_miners", "refId": "A"}],
+      "title": "Enrolled Miners",
+      "type": "stat"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "palette-classic"},
+          "custom": {"axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": true},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "short"
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+      "id": 5,
+      "options": {"legend": {"calcs": [], "displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "single"}},
+      "pluginVersion": "8.0.0",
+      "targets": [
+        {"expr": "rustchain_miners_active", "legendFormat": "Active Miners", "refId": "A"},
+        {"expr": "rustchain_miners_total", "legendFormat": "Total Miners", "refId": "B"}
+      ],
+      "title": "Miners Overview",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "palette-classic"},
+          "custom": {"axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": true},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "bytes"
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+      "id": 6,
+      "options": {"legend": {"calcs": [], "displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "single"}},
+      "pluginVersion": "8.0.0",
+      "targets": [{"expr": "rustchain_db_size_bytes", "legendFormat": "DB Size", "refId": "A"}],
+      "title": "Database Size",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "palette-classic"},
+          "custom": {"axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": true},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "RTC"
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
+      "id": 7,
+      "options": {"legend": {"calcs": [], "displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "single"}},
+      "pluginVersion": "8.0.0",
+      "targets": [{"expr": "rustchain_total_supply_rtc", "legendFormat": "Total Supply", "refId": "A"}],
+      "title": "Total RTC Supply",
+      "type": "timeseries"
+    },
+    {
+      "datasource": "Prometheus",
+      "fieldConfig": {
+        "defaults": {
+          "color": {"mode": "palette-classic"},
+          "custom": {"axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": {"legend": false, "tooltip": false, "viz": false}, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": {"type": "linear"}, "showPoints": "never", "spanNulls": true},
+          "mappings": [],
+          "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
+      "id": 8,
+      "options": {"legend": {"calcs": [], "displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "single"}},
+      "pluginVersion": "8.0.0",
+      "targets": [{"expr": "histogram_quantile(0.95, rate(rustchain_api_request_duration_seconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"}],
+      "title": "API Latency (p95)",
+      "type": "timeseries"
+    }
+  ],
+  "schemaVersion": 27,
+  "style": "dark",
+  "tags": ["rustchain", "blockchain"],
+  "templating": {"list": []},
+  "time": {"from": "now-6h", "to": "now"},
+  "timepicker": {},
+  "timezone": "",
+  "title": "RustChain Node Dashboard",
+  "uid": "rustchain-node",
+  "version": 1
+}
diff --git a/grafana/prometheus.yml b/grafana/prometheus.yml
new file mode 100644
index 00000000..58d9fecb
--- /dev/null
+++ b/grafana/prometheus.yml
@@ -0,0 +1,18 @@
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: ['localhost:9093']
+
+rule_files:
+  - 'alerts-rustchain.yml'
+
+scrape_configs:
+  - job_name: 'rustchain-node'
+    static_configs:
+      - targets: ['localhost:5000']
+    metrics_path: /metrics
+    scrape_interval: 15s
diff --git a/node/rustchain_v2_integrated_v2.2.1_rip200.py b/node/rustchain_v2_integrated_v2.2.1_rip200.py
index 2288fd86..c6e5eb15 100644
--- a/node/rustchain_v2_integrated_v2.2.1_rip200.py
+++ b/node/rustchain_v2_integrated_v2.2.1_rip200.py
@@ -769,6 +769,7 @@ def light_client_static(subpath: str):
 
 
 # Prometheus metrics
+# --- Existing metrics ---
 withdrawal_requests = Counter('rustchain_withdrawal_requests', 'Total withdrawal requests')
 withdrawal_completed = Counter('rustchain_withdrawal_completed', 'Completed withdrawals')
 withdrawal_failed = Counter('rustchain_withdrawal_failed', 'Failed withdrawals')
@@ -776,6 +777,34 @@ def light_client_static(subpath: str):
 epoch_gauge = Gauge('rustchain_current_epoch', 'Current epoch')
 withdrawal_queue_size = Gauge('rustchain_withdrawal_queue', 'Pending withdrawals')
 
+# --- Bounty #765: Prometheus Metrics Exporter ---
+# Node health metrics
+node_up = Gauge('rustchain_node_up', 'Node health status (1=up, 0=down)')
+node_uptime_seconds = Gauge('rustchain_node_uptime_seconds', 'Node uptime in seconds')
+node_version_info = Gauge('rustchain_node_version_info', 'Node version info', ['version'])
+
+# Epoch state metrics (rustchain_epoch_current is set to the same value as the legacy rustchain_current_epoch)
+epoch_current = Gauge('rustchain_epoch_current', 'Current epoch number')
+epoch_slot = Gauge('rustchain_epoch_slot', 'Current slot within epoch')
+epoch_enrolled_miners = Gauge('rustchain_epoch_enrolled_miners', 'Number of enrolled miners')
+epoch_pot_rtc = Gauge('rustchain_epoch_pot_rtc', 'Reward pool for current epoch (RTC)')
+
+# Miner metrics (attestation age is exported as one series per miner label)
+miners_active = Gauge('rustchain_miners_active', 'Number of active miners')
+miners_total = Gauge('rustchain_miners_total', 'Total miners (all time)')
+attestation_age_seconds = Gauge('rustchain_attestation_age_seconds', 'Time since last attestation', ['miner'])
+
+# Balance metrics (wallet balance is exported as one series per wallet label)
+total_supply_rtc = Gauge('rustchain_total_supply_rtc', 'Total RTC supply')
+wallet_balance_rtc = Gauge('rustchain_wallet_balance_rtc', 'Wallet balance', ['wallet'])
+
+# Database metrics
+db_size_bytes = Gauge('rustchain_db_size_bytes', 'Database size in bytes')
+backup_age_hours = Gauge('rustchain_backup_age_hours', 'Age of last backup in hours')
+
+# API request latency histogram, labelled by endpoint (buckets from 5 ms to 10 s)
+api_request_duration_seconds = Histogram('rustchain_api_request_duration_seconds', 'API request duration', ['endpoint'], buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0))
+
 # Database setup
 # Allow env override for local dev / different deployments.
 DB_PATH = os.environ.get("RUSTCHAIN_DB_PATH") or os.environ.get("DB_PATH") or "./rustchain_v2.db"
@@ -4428,8 +4457,103 @@ def api_ready():
 
 @app.route('/metrics', methods=['GET'])
 def metrics():
-    """Prometheus metrics endpoint"""
-    return generate_latest()
+    """Prometheus metrics endpoint - Bounty #765: Prometheus Metrics Exporter.
+
+    Refreshes the gauges from the database and the filesystem on every scrape,
+    then returns the text exposition. Collection is best-effort: a section that
+    fails is logged and skipped so the scrape itself still succeeds.
+    """
+    import logging
+    from contextlib import closing
+    from prometheus_client import CONTENT_TYPE_LATEST
+
+    log = logging.getLogger(__name__)
+
+    # Node health: answering the scrape at all means the process is up.
+    node_up.set(1)
+    node_uptime_seconds.set(int(time.time() - APP_START_TS))
+    node_version_info.labels(version=APP_VERSION).set(1)
+
+    def update_epoch(conn):
+        """Publish the chain-tip epoch and its enrolled-miner count."""
+        epoch_row = conn.execute("SELECT epoch FROM chain_tip LIMIT 1").fetchone()
+        if epoch_row:
+            current_epoch = epoch_row[0]
+            epoch_current.set(current_epoch)
+            epoch_gauge.set(current_epoch)
+            enrolled = conn.execute(
+                "SELECT COUNT(*) FROM epoch_enrolled WHERE epoch = ?", (current_epoch,)
+            ).fetchone()
+            if enrolled:
+                epoch_enrolled_miners.set(enrolled[0])
+        # Rewards per epoch are a fixed 1.5 RTC.
+        epoch_pot_rtc.set(1.5)
+
+    def update_miner_counts(conn):
+        """Publish active (attested in the last 2 epochs) and all-time miner counts."""
+        active = conn.execute(
+            "SELECT COUNT(DISTINCT miner) FROM attestations "
+            "WHERE epoch >= (SELECT MAX(epoch) FROM chain_tip) - 1"
+        ).fetchone()
+        if active:
+            miners_active.set(active[0])
+        total = conn.execute("SELECT COUNT(DISTINCT miner) FROM attestations").fetchone()
+        if total:
+            miners_total.set(total[0])
+
+    def update_attestation_ages(conn):
+        """Publish seconds since the last attestation of the 100 most recent miners."""
+        now = int(time.time())
+        rows = conn.execute(
+            "SELECT miner, MAX(ts_ok) AS last_attest FROM miner_attest_recent "
+            "GROUP BY miner ORDER BY last_attest DESC LIMIT 100"
+        ).fetchall()
+        for miner, last_attest in rows:
+            # A NULL timestamp would raise and abort the remaining miners.
+            if last_attest is not None:
+                attestation_age_seconds.labels(miner=miner).set(now - last_attest)
+
+    def update_supply(conn):
+        """Publish the sum of all account balances as RTC."""
+        row = conn.execute("SELECT SUM(balance) FROM accounts").fetchone()
+        # SUM() is NULL on an empty table; a genuine 0 must still be published.
+        # NOTE(review): assumes balances are stored in 1e-8 RTC base units - confirm.
+        if row and row[0] is not None:
+            total_supply_rtc.set(row[0] / 1e8)
+
+    sections = (
+        ("epoch", update_epoch),
+        ("miner counts", update_miner_counts),
+        ("attestation ages", update_attestation_ages),
+        ("supply", update_supply),
+    )
+    try:
+        # sqlite3's own context manager only ends the transaction; closing()
+        # releases the handle so scrapes do not leak connections.
+        with closing(sqlite3.connect(DB_PATH, timeout=3)) as conn:
+            for section, update in sections:
+                try:
+                    update(conn)
+                except Exception as exc:
+                    log.warning("metrics: %s update failed: %s", section, exc)
+    except sqlite3.Error as exc:
+        log.warning("metrics: cannot open %s: %s", DB_PATH, exc)
+
+    # Database file size.
+    try:
+        db_size_bytes.set(os.path.getsize(os.path.abspath(DB_PATH)))
+    except OSError as exc:
+        log.warning("metrics: cannot stat database: %s", exc)
+
+    # Backup age. Without a backup file the gauge keeps its previous value.
+    try:
+        backup_mtime = os.path.getmtime(DB_PATH + ".bak")
+    except OSError:
+        backup_mtime = None
+    if backup_mtime is not None:
+        backup_age_hours.set((time.time() - backup_mtime) / 3600)
+
+    return generate_latest(), 200, {'Content-Type': CONTENT_TYPE_LATEST}
 
 
 @app.route('/rewards/settle', methods=['POST'])
diff --git a/tools/cross_node_verifier.py b/tools/cross_node_verifier.py
new file mode 100644
index 00000000..951b07be
--- /dev/null
+++ b/tools/cross_node_verifier.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+Cross-Node Ledger Verifier - Bounty #763
+Query all 3 RustChain nodes, compare state, and alert on mismatches.
+
+Usage:
+    python tools/cross_node_verifier.py
+    python tools/cross_node_verifier.py --ci
+    python tools/cross_node_verifier.py --webhook URL
+
+Reward: up to 75 RTC
+"""
+
+import argparse
+import json
+import sqlite3
+import sys
+import time
+import requests
+from contextlib import closing
+from datetime import datetime, timezone
+from typing import Dict, List, Optional, Any
+import hashlib
+import urllib3
+
+# NOTE(review): every request below passes verify=False, so TLS certificates are
+# not validated; this call only silences the resulting warning.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+NODES = {
+    "Node 1 (Primary)": {"url": "https://50.28.86.131", "location": "LiquidWeb VPS"},
+    "Node 2": {"url": "https://50.28.86.153", "location": "LiquidWeb VPS"},
+    "Node 3": {"url": "http://100.88.109.32:8099", "location": "Ryan's Proxmox"},
+}
+
+ENDPOINTS = {"health": "/health", "epoch": "/epoch", "stats": "/api/stats"}
+
+
+class CrossNodeVerifier:
+    """Query every node in NODES and report where their state disagrees."""
+
+    def __init__(self, db_path: str = "cross_node_verifier.db", timeout: int = 10):
+        self.timeout = timeout
+        self.db_path = db_path
+        self.node_data: Dict[str, Dict[str, Any]] = {}
+        self.mismatches: List[Dict[str, Any]] = []
+        self._init_db()
+
+    def _init_db(self):
+        """Create the verification_history table if it does not exist."""
+        # closing() releases the handle even if the DDL statement fails.
+        with closing(sqlite3.connect(self.db_path)) as conn:
+            conn.execute("""
+                CREATE TABLE IF NOT EXISTS verification_history (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    timestamp TEXT NOT NULL,
+                    node1_epoch INTEGER, node2_epoch INTEGER, node3_epoch INTEGER,
+                    sync_status TEXT, details TEXT
+                )
+            """)
+            conn.commit()
+
+    def _record_history(self, timestamp: str, epochs: List[int], in_sync: bool):
+        """Append one run to verification_history; failures are printed, not raised."""
+        # The table has exactly three epoch columns; pad or truncate to fit.
+        node_epochs = (list(epochs) + [None] * 3)[:3]
+        status = "IN_SYNC" if in_sync else "MISMATCH"
+        try:
+            with closing(sqlite3.connect(self.db_path)) as conn:
+                conn.execute(
+                    "INSERT INTO verification_history (timestamp, node1_epoch, "
+                    "node2_epoch, node3_epoch, sync_status, details) "
+                    "VALUES (?, ?, ?, ?, ?, ?)",
+                    (timestamp, *node_epochs, status, json.dumps(self.mismatches)),
+                )
+                conn.commit()
+        except sqlite3.Error as e:
+            print(f"Could not record verification history: {e}")
+
+    def query_node(self, node_name: str, endpoint: str) -> Optional[Dict]:
+        """GET *endpoint* from *node_name*; return the parsed JSON, or None on failure."""
+        url = f"{NODES[node_name]['url']}{endpoint}"
+        try:
+            response = requests.get(url, timeout=self.timeout, verify=False)
+            response.raise_for_status()
+            return response.json()
+        except (requests.RequestException, ValueError) as e:
+            print(f"Error querying {node_name} {endpoint}: {e}")
+            return None
+
+    def get_node_health(self, node_name: str) -> Dict[str, Any]:
+        """Return version/uptime/db_ok for a node, or {"status": "UNREACHABLE"}."""
+        data = self.query_node(node_name, ENDPOINTS["health"])
+        if data:
+            return {"version": data.get("version", "unknown"), "uptime": data.get("uptime_seconds", 0), "db_ok": data.get("db_rw", True)}
+        return {"status": "UNREACHABLE"}
+
+    def get_epoch_state(self, node_name: str) -> Dict[str, Any]:
+        """Return epoch/slot/enrolled for a node; only {"epoch": -1} when unreachable."""
+        data = self.query_node(node_name, ENDPOINTS["epoch"])
+        if data:
+            return {"epoch": data.get("epoch", 0), "slot": data.get("slot", 0), "enrolled": data.get("enrolled_miners", 0)}
+        return {"epoch": -1}
+
+    def get_balance(self, node_name: str, miner_id: str) -> Optional[float]:
+        """Return the miner's balance in RTC as reported by a node, or None on failure."""
+        url = f"{NODES[node_name]['url']}/balance/{miner_id}"
+        try:
+            response = requests.get(url, timeout=self.timeout, verify=False)
+            # Without this an HTTP error body would be read as a 0 balance.
+            response.raise_for_status()
+            return response.json().get("balance_rtc", 0)
+        except (requests.RequestException, ValueError, AttributeError):
+            return None
+
+    def compute_merkle_root(self, data: Dict) -> str:
+        """Return the first 16 hex digits of SHA-256 over the key-sorted JSON of *data*."""
+        serialized = json.dumps(data, sort_keys=True)
+        return hashlib.sha256(serialized.encode()).hexdigest()[:16]
+
+    def verify_all_nodes(self) -> bool:
+        """Query all nodes, print a report and return True when they agree.
+
+        Disagreements are collected in self.mismatches, and the run is
+        appended to the verification_history table.
+        """
+        # Start clean so a second call does not inherit earlier mismatches.
+        self.node_data = {}
+        self.mismatches = []
+        timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"
+
+        print("=" * 60)
+        print("RustChain Cross-Node Verification Report")
+        print("=" * 60)
+        print(f"Timestamp: {timestamp}\n")
+
+        for node_name in NODES:
+            self.node_data[node_name] = {
+                "health": self.get_node_health(node_name),
+                "epoch": self.get_epoch_state(node_name),
+            }
+
+        print("Node Health:")
+        for node_name, data in self.node_data.items():
+            h = data["health"]
+            print(f"  {node_name}: {h.get('version','?')}, uptime {h.get('uptime',0)}s")
+
+        print("\nEpoch State:")
+        epochs = []
+        for node_name, data in self.node_data.items():
+            e = data["epoch"]
+            epochs.append(e["epoch"])
+            # An unreachable node only carries "epoch", so read the rest defensively.
+            print(f"  {node_name}: epoch={e['epoch']}, slot={e.get('slot', '?')}, enrolled={e.get('enrolled', '?')}")
+
+        epoch_match = len(set(epochs)) == 1
+        if not epoch_match:
+            self.mismatches.append({"type": "epoch", "values": epochs})
+
+        print("\nBalance Spot-Check (founder_community):")
+        balances = {}
+        for node_name in NODES:
+            b = self.get_balance(node_name, "founder_community")
+            balances[node_name] = b
+            # get_balance returns None on failure, which ":.2f" cannot format.
+            shown = "UNREACHABLE" if b is None else f"{b:.2f} RTC"
+            print(f"  {node_name}: {shown}")
+
+        if len(set(balances.values())) > 1:
+            self.mismatches.append({"type": "balance", "values": balances})
+
+        print("\nMerkle Roots:")
+        roots = {}
+        for node_name, data in self.node_data.items():
+            # Hash only the epoch state: health carries per-node uptime, which
+            # differs on every node and would make the roots always disagree.
+            root = self.compute_merkle_root(data["epoch"])
+            roots[node_name] = root
+            print(f"  {node_name}: {root}")
+
+        if len(set(roots.values())) > 1:
+            self.mismatches.append({"type": "merkle", "values": roots})
+
+        print("\n" + "=" * 60)
+        in_sync = len(self.mismatches) == 0
+        print(f"RESULT: {'ALL NODES IN SYNC' if in_sync else f'MISMATCH ({len(self.mismatches)} issues)'}")
+        print("=" * 60)
+
+        self._record_history(timestamp, epochs, in_sync)
+        return in_sync
+
+    def send_webhook(self, url: str):
+        """POST the collected mismatches to *url*; do nothing when there are none."""
+        if not self.mismatches:
+            return
+        try:
+            response = requests.post(url, json={"mismatches": self.mismatches}, timeout=10)
+            # Treat HTTP error statuses as a failed delivery, not a success.
+            response.raise_for_status()
+            print(f"Webhook sent to {url}")
+        except requests.RequestException as e:
+            print(f"Webhook failed: {e}")
+
+
+def main():
+    """Run one verification pass; exit 0 when all nodes agree, 1 otherwise."""
+    parser = argparse.ArgumentParser()
+    # --ci is accepted but has no effect here: the exit status already reports sync state.
+    parser.add_argument("--ci", action="store_true")
+    parser.add_argument("--webhook", type=str)
+    parser.add_argument("--db", default="cross_node_verifier.db")
+    parser.add_argument("--timeout", type=int, default=10)
+    args = parser.parse_args()
+
+    verifier = CrossNodeVerifier(args.db, args.timeout)
+    in_sync = verifier.verify_all_nodes()
+
+    if args.webhook and not in_sync:
+        verifier.send_webhook(args.webhook)
+
+    sys.exit(0 if in_sync else 1)
+
+
+if __name__ == "__main__":
+    main()