Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions config/alertmanager/alertmanager.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Defaults shared by all receivers below.
global:
  resolve_timeout: 5m
  # SMTP sender and relay. No email receiver is defined in this file, so these
  # only take effect if an `email_configs` entry is added later.
  # NOTE(review): the sender value looks like a redacted placeholder — confirm
  # the real address before deploying.
  smtp_from: '[email protected]'
  smtp_smarthost: 'localhost:587'

# Routing tree. Alerts that match none of the child routes are delivered to
# the 'default' receiver using the timings declared at this level.
route:
  receiver: 'default'
  group_by:
    - severity
    - team
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  routes:
    # Critical trading alerts: sent without any initial batching delay.
    - receiver: 'trading-critical'
      match:
        team: trading
        severity: critical
      group_wait: 0s
      group_interval: 30s
      repeat_interval: 10m

    # Critical platform alerts: high priority, short batching window.
    - receiver: 'platform-critical'
      match:
        team: platform
        severity: critical
      group_wait: 5s
      group_interval: 1m
      repeat_interval: 30m

    # Infrastructure alerts of any severity.
    - receiver: 'infrastructure-alerts'
      match:
        team: infrastructure
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 2h

    # Bridge connectivity alerts of any severity.
    - receiver: 'bridge-alerts'
      match:
        team: bridges
      group_wait: 15s
      group_interval: 2m
      repeat_interval: 1h

# Notification targets, referenced by name from the routing tree.
#
# Alertmanager does not expand ${VAR} placeholders in its configuration file,
# so a literal '${SLACK_WEBHOOK_URL}' is not a usable webhook URL. Each Slack
# integration therefore reads the webhook from a file via `api_url_file`.
# Mount the secret at the path used below, or adjust the path to match the
# deployment.
# NOTE(review): if this file is rendered through envsubst (or similar) before
# Alertmanager starts, the original `api_url` placeholder form can be kept.
receivers:
  # Catch-all receiver for alerts that match no specific route.
  - name: 'default'
    slack_configs:
      - api_url_file: '/etc/alertmanager/secrets/slack_webhook_url'
        channel: '#alerts'
        title: 'JuliaOS Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

  # Critical trading alerts: Slack message plus a webhook call to the
  # JuliaOS server; both also notify on resolution.
  - name: 'trading-critical'
    slack_configs:
      - api_url_file: '/etc/alertmanager/secrets/slack_webhook_url'
        channel: '#trading-critical'
        title: '🚨 CRITICAL TRADING ALERT'
        text: |
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Severity:* {{ .Labels.severity }}
          *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ end }}
        color: 'danger'
        send_resolved: true
    webhook_configs:
      - url: 'http://juliaos-server:8052/api/v1/alerts/critical'
        send_resolved: true

  # Critical platform alerts.
  - name: 'platform-critical'
    slack_configs:
      - api_url_file: '/etc/alertmanager/secrets/slack_webhook_url'
        channel: '#platform-alerts'
        title: '⚠️ Platform Critical Alert'
        # agent_id is printed only when the label is set, followed by a space,
        # so it no longer runs straight into the instance value.
        text: |
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Agent/Component:* {{ with .Labels.agent_id }}{{ . }} {{ end }}{{ .Labels.instance }}
          *Description:* {{ .Annotations.description }}
          {{ end }}
        color: 'warning'

  # Infrastructure alerts.
  - name: 'infrastructure-alerts'
    slack_configs:
      - api_url_file: '/etc/alertmanager/secrets/slack_webhook_url'
        channel: '#infrastructure'
        title: 'Infrastructure Alert'
        text: |
          {{ range .Alerts }}
          *System:* {{ .Labels.instance }}
          *Issue:* {{ .Annotations.summary }}
          {{ end }}

  # Bridge connectivity alerts.
  - name: 'bridge-alerts'
    slack_configs:
      - api_url_file: '/etc/alertmanager/secrets/slack_webhook_url'
        channel: '#bridge-alerts'
        title: 'Bridge Connectivity Alert'
        text: |
          {{ range .Alerts }}
          *Bridge:* {{ .Labels.bridge_name }}
          *Status:* {{ .Annotations.summary }}
          {{ end }}

# Inhibition: a firing "source" alert mutes matching "target" alerts, but only
# when both carry identical values for every label listed under `equal`.
inhibit_rules:
  # Critical alerts mute warning alerts for the same team and instance.
  - equal:
      - team
      - instance
    source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'

  # A system-wide memory alert mutes per-agent memory alerts on that instance.
  - equal:
      - instance
    source_match:
      alertname: 'SystemMemoryHigh'
    target_match:
      alertname: 'AgentMemoryHigh'
191 changes: 191 additions & 0 deletions config/grafana/dashboards/trading-metrics.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
{
  "dashboard": {
    "id": null,
    "title": "JuliaOS Trading Metrics",
    "tags": ["trading", "juliaos", "performance"],
    "timezone": "UTC",
    "refresh": "5s",
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "panels": [
      {
        "id": 1,
        "title": "Trading Performance Overview",
        "type": "stat",
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "trading_portfolio_pnl_total",
            "legendFormat": "Total P&L"
          },
          {
            "expr": "trading_portfolio_sharpe_ratio",
            "legendFormat": "Sharpe Ratio"
          },
          {
            "expr": "trading_portfolio_drawdown_pct",
            "legendFormat": "Max Drawdown %"
          },
          {
            "expr": "trading_strategy_win_rate",
            "legendFormat": "Win Rate"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "yellow", "value": 1},
                {"color": "green", "value": 2}
              ]
            }
          }
        }
      },
      {
        "id": 2,
        "title": "Execution Latency Distribution",
        "description": "Latency quantiles estimated from the 5-minute rate of the histogram's _bucket series, aggregated by le.",
        "type": "graph",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum by (le) (rate(trading_execution_latency_seconds_bucket[5m])))",
            "legendFormat": "P50 Latency"
          },
          {
            "expr": "histogram_quantile(0.95, sum by (le) (rate(trading_execution_latency_seconds_bucket[5m])))",
            "legendFormat": "P95 Latency"
          },
          {
            "expr": "histogram_quantile(0.99, sum by (le) (rate(trading_execution_latency_seconds_bucket[5m])))",
            "legendFormat": "P99 Latency"
          }
        ],
        "yAxes": [
          {
            "label": "Latency (seconds)",
            "max": 0.1,
            "min": 0
          }
        ],
        "thresholds": [
          {
            "value": 0.01,
            "colorMode": "critical",
            "op": "gt"
          }
        ]
      },
      {
        "id": 3,
        "title": "Agent Health Status",
        "type": "graph",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
        "targets": [
          {
            "expr": "up{job=\"trading-agents\"}",
            "legendFormat": "Agent {{agent_id}}"
          }
        ],
        "yAxes": [
          {
            "label": "Status",
            "max": 1,
            "min": 0
          }
        ]
      },
      {
        "id": 4,
        "title": "Portfolio Value Over Time",
        "type": "graph",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
        "targets": [
          {
            "expr": "trading_portfolio_value_usd",
            "legendFormat": "Portfolio Value (USD)"
          }
        ],
        "yAxes": [
          {
            "label": "Value (USD)",
            "logBase": 1
          }
        ]
      },
      {
        "id": 5,
        "title": "Risk Metrics",
        "type": "graph",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
        "targets": [
          {
            "expr": "risk_var_1d_pct",
            "legendFormat": "1-Day VaR %"
          },
          {
            "expr": "risk_expected_shortfall_pct",
            "legendFormat": "Expected Shortfall %"
          },
          {
            "expr": "risk_leverage_ratio",
            "legendFormat": "Leverage Ratio"
          }
        ],
        "yAxes": [
          {
            "label": "Risk Metric",
            "logBase": 1
          }
        ]
      },
      {
        "id": 6,
        "title": "Bridge Performance",
        "type": "table",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 24},
        "targets": [
          {
            "expr": "bridge_health_status",
            "legendFormat": "{{bridge_name}}"
          }
        ],
        "transformations": [
          {
            "id": "organize",
            "options": {
              "excludeByName": {},
              "indexByName": {},
              "renameByName": {
                "bridge_name": "Bridge",
                "Value": "Status"
              }
            }
          }
        ]
      },
      {
        "id": 7,
        "title": "DEX Trading Volume",
        "description": "Per-second rate over a 5-minute window, multiplied by 60 to express USD per minute.",
        "type": "graph",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 24},
        "targets": [
          {
            "expr": "rate(dex_trade_volume_usd_total[5m]) * 60",
            "legendFormat": "{{dex_name}} Volume/min"
          }
        ],
        "yAxes": [
          {
            "label": "Volume (USD/min)",
            "logBase": 1
          }
        ]
      }
    ]
  }
}
70 changes: 70 additions & 0 deletions config/loki/loki.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Single-instance Loki configuration: filesystem storage under /loki and an
# in-memory ring key-value store.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info

common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: inmemory

# Cache query results in an embedded cache of up to 100 MB.
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://alertmanager:9093

# Optimized for high-volume trading logs
limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h
  ingestion_rate_mb: 16
  ingestion_burst_size_mb: 32
  max_streams_per_user: 10000
  max_line_size: 256kb

# Retention policy for trading data
table_manager:
  retention_deletes_enabled: true
  retention_period: 168h  # 7 days for detailed logs

# Performance tuning
# NOTE(review): queries are limited to the last 24h while retention above
# keeps 168h of data — confirm that the older 6 days are meant to be stored
# but not queryable.
chunk_store_config:
  max_look_back_period: 24h

ingester:
  max_chunk_age: 1h
  chunk_target_size: 1536000
  chunk_retain_period: 30s
  max_transfer_retries: 0
  wal:
    enabled: true
    dir: /loki/wal

# Structured metadata for trading logs
# The former top-level `structured_metadata:` section (enabled: true,
# max_size_kb: 64) was removed: Loki has no such configuration section and
# refuses to start when the file contains an unknown field. In Loki releases
# that support the feature it is controlled per tenant under `limits_config`
# (`allow_structured_metadata`, `max_structured_metadata_size`) and requires a
# TSDB index with schema v13, which the boltdb-shipper/v11 period above does
# not provide.
Loading