-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
328 lines (269 loc) · 9.45 KB
/
Copy pathapp.py
File metadata and controls
328 lines (269 loc) · 9.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
"""
AIOPS 演示系统 - 主服务入口
实现四大核心能力:
1. 因果分析 - 知道"谁干的"
2. 智能决策 - 把排查变成"选择题"
3. 业务感知 - 懂技术又懂业务
4. Copilot - 自然语言交互
"""
from flask import Flask, render_template, jsonify, request
from datetime import datetime, timedelta
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from engine.causality import CausalityEngine, Alert, Severity, ChangeEvent
from engine.decision import DecisionEngine
from engine.knowledge import KnowledgeEngine
from services.mock_data import MockMetricsService, MockGitService, MockAlertService
app = Flask(__name__)

# Analysis engines used by the request handlers below.
causality_engine = CausalityEngine()
decision_engine = DecisionEngine()
knowledge_engine = KnowledgeEngine()

# Mock services that supply metrics, git history, and alerts.
metrics_service = MockMetricsService()
git_service = MockGitService()
alert_service = MockAlertService()

# Service dependency configuration: each service name is registered together
# with the list passed as its dependencies (an empty list for leaf services).
_SERVICE_DEPENDENCIES = {
    "order_service": ["payment_service", "inventory_service"],
    "payment_service": ["database"],
    "inventory_service": ["database"],
    "user_service": ["database", "cache"],
    "recommendation_service": ["user_service", "cache"],
    "notification_service": ["user_service"],
    "database": [],
    "cache": [],
}
for _service_name, _depends_on in _SERVICE_DEPENDENCIES.items():
    causality_engine.register_service(_service_name, _depends_on)
def setup_sample_data():
    """Record three hard-coded sample change events in the causality engine.

    Every timestamp is expressed as an offset back from a single
    ``datetime.now()`` reading taken when the function starts.
    """
    reference_time = datetime.now()

    # (commit_id, author, service, file_path, line_number, change_type,
    #  description, age of the change relative to reference_time)
    seed_rows = (
        (
            "a1b2c3d", "张伟", "order_service", "OrderService.java", 45,
            "modify", "优化订单查询逻辑,添加全表扫描以兼容老版本",
            timedelta(hours=2),
        ),
        (
            "e5f6g7h", "李娜", "payment_service", "PaymentController.java", 120,
            "modify", "修复支付超时处理逻辑,添加重试机制",
            timedelta(hours=5),
        ),
        (
            "i8j9k0l", "王强", "inventory_service", "InventoryDao.java", 88,
            "add", "新增库存预扣减功能,使用批量更新",
            timedelta(days=1),
        ),
    )

    # Build every event first, then record them, so nothing is recorded if
    # constructing any event fails.
    events = [
        ChangeEvent(
            commit_id=commit_id,
            author=author,
            service=service,
            file_path=file_path,
            line_number=line_number,
            change_type=change_type,
            description=description,
            timestamp=reference_time - age,
        )
        for (
            commit_id, author, service, file_path,
            line_number, change_type, description, age,
        ) in seed_rows
    ]
    for event in events:
        causality_engine.record_change(event)
# Seed the sample change records when this module is loaded.
setup_sample_data()
@app.route("/")
def index():
    """Home page: render the ``dashboard.html`` template."""
    page = render_template("dashboard.html")
    return page
@app.route("/api/metrics")
def get_metrics():
    """Return all metrics from the metrics service, grouped by service.

    The JSON response has a ``timestamp`` (ISO-8601 string of the current
    time) and ``services``, a mapping of service name -> metric name ->
    ``{value, unit, threshold, status}``.
    """
    grouped = {}
    for sample in metrics_service.get_all_metrics():
        # Create the per-service bucket on first sight, then file the metric.
        per_service = grouped.setdefault(sample.service, {})
        per_service[sample.metric_name] = {
            "value": sample.value,
            "unit": sample.unit,
            "threshold": sample.threshold,
            "status": sample.status,
        }

    payload = {
        "timestamp": datetime.now().isoformat(),
        "services": grouped,
    }
    return jsonify(payload)
@app.route("/api/alerts")
def get_alerts():
    """Return the alert service's active alerts under the ``alerts`` key."""
    active_alerts = alert_service.get_active_alerts()
    payload = {"alerts": active_alerts}
    return jsonify(payload)
def _serialize_root_cause(root_cause):
    """Copy the fields of a root-cause result into a plain dict for the response."""
    return {
        "commit_id": root_cause.commit_id,
        "author": root_cause.author,
        "file_path": root_cause.file_path,
        "line_number": root_cause.line_number,
        "description": root_cause.description,
        "confidence": root_cause.confidence,
        "solution": root_cause.solution,
        "related_services": root_cause.related_services,
    }


def _serialize_decision(decision):
    """Copy a decision recommendation and its options into a plain dict for the response."""
    return {
        "problem_summary": decision.problem_summary,
        "options": [
            {
                "label": opt.label,
                "description": opt.description,
                "success_rate": opt.success_rate,
                "risk_level": opt.risk_level.value,
                "risk_description": opt.risk_description,
                "estimated_downtime": opt.estimated_downtime,
                "auto_executable": opt.auto_executable,
            }
            for opt in decision.options
        ],
        "recommended_option": decision.recommended_option,
        "reasoning": decision.reasoning,
    }


@app.route("/api/analyze", methods=["POST"])
def analyze_alert():
    """Analyze an alert - the core API.

    Implements "don't tell me there is a problem, tell me who caused it":
    generates an alert, runs causality analysis on it, asks the decision
    engine for a recommendation, and checks whether to escalate.

    Request body (JSON object, every field optional):
        alert_type: alert kind to generate; defaults to ``"high_cpu"``.
        service: affected service name; defaults to ``"order_service"``.

    A missing, malformed, or non-object body is treated as an empty object,
    so the defaults above apply instead of the request failing.

    Returns:
        JSON object with keys ``alert`` (the generated alert), ``root_cause``
        (``None`` when the causality engine returns nothing truthy),
        ``decision`` and ``escalation``.
    """
    # silent=True yields None instead of aborting on a bad or absent JSON body;
    # the isinstance check also covers valid JSON that is not an object.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}
    alert_type = payload.get("alert_type", "high_cpu")
    service = payload.get("service", "order_service")

    # Generate the alert.
    alert_event = alert_service.generate_alert(alert_type, service)

    # Build the Alert object; unrecognized severity labels map to P2_MEDIUM.
    severity_map = {
        "critical": Severity.P0_CRITICAL,
        "high": Severity.P1_HIGH,
        "medium": Severity.P2_MEDIUM,
        "low": Severity.P3_LOW,
    }
    alert = Alert(
        alert_id=alert_event["alert_id"],
        service=service,
        metric=alert_event["metric"],
        value=alert_event["value"],
        threshold=alert_event["threshold"],
        severity=severity_map.get(alert_event["severity"], Severity.P2_MEDIUM),
        timestamp=datetime.now(),
        symptom=alert_event["symptom"],
    )

    # Causality analysis.
    root_cause = causality_engine.analyze(alert)

    # Decision recommendation.
    decision = decision_engine.recommend(
        alert_type=alert_event["metric"],
        context={
            "alert_id": alert_event["alert_id"],
            "service": service,
            "symptom": alert_event["symptom"],
            "status": alert_event["severity"],
        },
    )

    # Business awareness: ask whether this service/severity should escalate.
    escalation = knowledge_engine.should_escalate(service, alert_event["severity"])

    result = {
        "alert": alert_event,
        "root_cause": _serialize_root_cause(root_cause) if root_cause else None,
        "decision": _serialize_decision(decision),
        "escalation": escalation,
    }
    return jsonify(result)
@app.route("/api/execute", methods=["POST"])
def execute_action():
    """Execute a chosen decision action.

    This is the execution step that follows turning troubleshooting into a
    multiple-choice question.

    Request body (JSON object):
        alert_id: identifier of the alert being acted on.
        action: the action to execute.
        auto_approved: optional flag, defaults to ``False``.

    Returns:
        The decision engine's execution result as JSON, or HTTP 400 with an
        ``error`` message when the body is missing, malformed, or not a JSON
        object.
    """
    # silent=True yields None instead of aborting on a bad or absent JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "请求体必须是 JSON 对象"}), 400

    # NOTE(review): auto_approved is taken from the client as-is - confirm the
    # decision engine enforces its own approval checks.
    result = decision_engine.execute(
        payload.get("alert_id"),
        payload.get("action"),
        payload.get("auto_approved", False),
    )
    return jsonify(result)
@app.route("/api/copilot", methods=["POST"])
def copilot_chat():
    """Copilot interaction ("a partner, an assistant, a co-pilot").

    Request body (JSON object):
        query: natural-language question; defaults to ``""``.

    A missing, malformed, or non-object body is treated as an empty object,
    so the default query is used instead of the request failing.

    Returns:
        JSON object echoing ``query``, the ``report`` produced by the
        knowledge engine, and a fixed list of ``suggestions``.
    """
    # silent=True yields None instead of aborting on a bad or absent JSON body;
    # the isinstance check also covers valid JSON that is not an object.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}
    query = payload.get("query", "")

    # Generate the natural-language report.
    report = knowledge_engine.generate_report(query)

    return jsonify({
        "query": query,
        "report": report,
        "suggestions": [
            "帮我分析昨晚的支付服务故障",
            "上周的可用性如何",
            "生成周报",
            "order_service 的状态",
        ],
    })
@app.route("/api/changes")
def get_changes():
    """Return the code change history.

    Query parameters:
        service: optional service name passed to the git service
            (``None`` when absent).
        hours: look-back window as an integer; defaults to 24. A value that
            cannot be parsed as an integer also falls back to 24 instead of
            failing the request.

    Returns:
        JSON object with the git service's result under ``changes``.
    """
    service = request.args.get("service")
    # With type=int, a value that fails conversion yields the default rather
    # than raising ValueError (which would surface as an HTTP 500).
    hours = request.args.get("hours", default=24, type=int)
    changes = git_service.get_recent_changes(service, hours)
    return jsonify({
        "changes": changes,
    })
@app.route("/api/services")
def get_services():
    """Return every service known to the knowledge engine with its configuration.

    Each entry carries the service ``name`` plus the importance,
    time-sensitivity, business-impact, escalation-contact and SLA-threshold
    fields read from its config.
    """
    catalog = [
        {
            "name": service_name,
            "importance": cfg.importance.value,
            "time_sensitivity": cfg.time_sensitivity.value,
            "business_impact": cfg.business_impact,
            "escalation_contact": cfg.escalation_contact,
            "sla_threshold": cfg.sla_threshold,
        }
        for service_name, cfg in knowledge_engine.service_configs.items()
    ]
    return jsonify({"services": catalog})
# Preset demo scenarios keyed by scenario name.
_DEMO_SCENARIOS = {
    # Scenario: database connection pool exhausted.
    "db_oom": {
        "alert_type": "db_pool_exhausted",
        "service": "order_service",
        "description": "模拟场景:order_service 数据库连接池耗尽,原因是张伟在 OrderService.java:45 添加了全表扫描",
    },
    "high_cpu": {
        "alert_type": "high_cpu",
        "service": "payment_service",
        "description": "模拟场景:payment_service CPU使用率飙升,疑似流量激增",
    },
    "oom": {
        "alert_type": "oom",
        "service": "recommendation_service",
        "description": "模拟场景:recommendation_service OOM,可能是内存泄漏",
    },
}


@app.route("/api/demo/scenario", methods=["POST"])
def run_demo_scenario():
    """Look up a preset demo scenario.

    Request body (JSON object):
        scenario: scenario name; defaults to ``"db_oom"``.

    A missing, malformed, or non-object body is treated as an empty object,
    so the default scenario is used instead of the request failing.

    Returns:
        JSON object with ``scenario``, ``description``, ``alert_type`` and
        ``service`` for a known scenario, or HTTP 400 with an ``error``
        message for an unknown one.
    """
    # silent=True yields None instead of aborting on a bad or absent JSON body;
    # the isinstance check also covers valid JSON that is not an object.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}
    scenario = payload.get("scenario", "db_oom")

    # Only strings can name a scenario; the guard keeps unhashable JSON values
    # (lists, objects) from raising during the dict lookup.
    preset = _DEMO_SCENARIOS.get(scenario) if isinstance(scenario, str) else None
    if preset is None:
        return jsonify({"error": "未知场景"}), 400

    return jsonify({
        "scenario": scenario,
        "description": preset["description"],
        "alert_type": preset["alert_type"],
        "service": preset["service"],
    })
if __name__ == "__main__":
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the port execute arbitrary Python. Because the server binds
    # to all interfaces, debug stays off unless explicitly requested through
    # the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in (
        "1", "true", "yes", "on",
    )
    print("=" * 60)
    print("AIOPS 演示系统启动中...")
    print("访问 http://localhost:5001 查看仪表盘")
    print("=" * 60)
    app.run(host="0.0.0.0", port=5001, debug=debug_enabled)