Description
Create an eval framework for praisonaiagents with a minimal code implementation on the client side; keep it as minimal as possible.
💡 Proposed PraisonAI Agents Implementation
1. Accuracy Evaluation
```python
from praisonaiagents import Agent
from praisonaiagents.eval import AccuracyEval

# Basic usage
agent = Agent(
    name="Analyst",
    role="Data Analyst",
    llm="gpt-4"
)

# Simple accuracy check
eval = AccuracyEval(
    agent=agent,
    input="What is the capital of France?",
    expected_output="Paris"
)

result = eval.run()
print(f"Accuracy: {result.score}/10")
```
Advanced accuracy evaluation:

```python
from praisonaiagents.eval import AccuracyEval, EvalCriteria

# Multi-criteria evaluation
eval = AccuracyEval(
    agent=agent,
    test_cases=[
        {
            "input": "Summarize the Q1 report",
            "expected_output": "Q1 showed 15% growth...",
            "weight": 2.0  # Higher importance
        },
        {
            "input": "What are the key risks?",
            "expected_output": "Supply chain, market volatility..."
        }
    ],
    criteria=EvalCriteria(
        factual_accuracy=0.4,  # 40% weight
        completeness=0.3,      # 30% weight
        relevance=0.3          # 30% weight
    ),
    evaluator_llm="gpt-4",  # Model for evaluation
    iterations=5,           # Statistical reliability
    save_results="eval_results.json"
)

# Run with detailed output
result = eval.run(verbose=True)

# Access statistics
print(f"Average: {result.avg_score:.2f}")
print(f"Std Dev: {result.std_dev:.2f}")
print(f"Confidence: {result.confidence_interval}")
```
2. Reliability Testing

```python
from praisonaiagents.eval import ReliabilityEval

# Test if the agent uses the expected tools
eval = ReliabilityEval(
    agent=agent,
    test_scenarios=[
        {
            "input": "Search weather and create report",
            "expected_tools": ["web_search", "create_file"],
            "required_order": True  # Tools must be called in order
        },
        {
            "input": "Analyze CSV data",
            "expected_tools": ["read_csv", "analyze_data"],
            "allow_additional": True  # Other tools allowed
        }
    ]
)

results = eval.run()
for scenario in results.scenarios:
    print(f"Scenario: {scenario.name} - {scenario.status}")
    if scenario.failed_tools:
        print(f"  Failed: {scenario.failed_tools}")
```
3. Performance Evaluation

```python
from praisonaiagents.eval import PerformanceEval

# Benchmark agent performance
eval = PerformanceEval(
    agent=agent,
    benchmark_queries=[
        "Simple question",
        "Complex analysis task",
        "Multi-step reasoning"
    ],
    metrics={
        "runtime": True,
        "memory": True,
        "tokens": True,  # Token usage tracking
        "ttft": True     # Time to first token
    },
    iterations=50,
    warmup=5
)

result = eval.run()

# Detailed performance report
result.print_report()
# Outputs a table with avg, p50, p95, p99 for each metric

# Compare agents (agent1, agent2, agent3: previously created Agent instances)
comparison = PerformanceEval.compare(
    agents=[agent1, agent2, agent3],
    benchmark_suite="standard",  # Predefined benchmarks
    export_format="html"         # Visual comparison report
)
```
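The runtime part of `PerformanceEval` could reduce to a timing loop with warmup and percentile reporting. The sketch below covers wall-clock time only and takes the agent call as an injected callable; token, memory, and TTFT tracking would need provider-specific hooks and are out of scope here:

```python
import time
from statistics import mean, quantiles
from typing import Callable, Dict, List


def benchmark(run_agent: Callable[[str], str], queries: List[str],
              iterations: int = 50, warmup: int = 5) -> Dict[str, float]:
    """Measure wall-clock runtime per call and report avg/p50/p95/p99 in seconds."""
    for _ in range(warmup):                 # warmup calls are not recorded
        run_agent(queries[0])

    samples: List[float] = []
    for i in range(iterations):
        query = queries[i % len(queries)]
        start = time.perf_counter()
        run_agent(query)
        samples.append(time.perf_counter() - start)

    cuts = quantiles(samples, n=100)        # cuts[k-1] is the k-th percentile
    return {
        "avg": mean(samples),
        "p50": cuts[49],
        "p95": cuts[94],
        "p99": cuts[98],
    }


# Example with a stand-in "agent" so the sketch runs on its own
report = benchmark(lambda q: q.upper(), ["Simple question"], iterations=20, warmup=2)
print(report)
```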
4. Automated Test Suite

```python
from praisonaiagents.eval import EvalSuite, TestCase, EvalFailure

# Define a comprehensive test suite
suite = EvalSuite(
    name="Agent Quality Assurance",
    agents=[agent],
    test_cases=[
        TestCase(
            name="Basic Math",
            input="What is 15 * 23?",
            expected_output="345",
            eval_type="accuracy",
            tags=["math", "simple"]
        ),
        TestCase(
            name="Tool Usage",
            input="Search and summarize AI news",
            expected_tools=["web_search", "summarize"],
            eval_type="reliability"
        ),
        TestCase(
            name="Performance Baseline",
            input="Standard benchmark query",
            max_runtime=2.0,  # seconds
            max_memory=100,   # MB
            eval_type="performance"
        )
    ],
    # Automation features
    schedule="0 2 * * *",  # Run daily at 2 AM
    alerts={
        "email": "[email protected]",
        "threshold": 0.8  # Alert if score < 80%
    },
    export_results="s3://bucket/eval-results/"
)

# Run the full suite
results = suite.run()

# CI/CD integration
if not results.passed:
    raise EvalFailure(f"Quality gate failed: {results.summary}")

# Generate a report
suite.generate_report(
    format="html",
    include_graphs=True,
    compare_with="last_week"
)
```
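Internally, `EvalSuite.run()` could simply dispatch each `TestCase` by `eval_type` and compare the results against the alert threshold, which keeps the CI/CD gate above to a single `results.passed` check. A rough sketch with placeholder names (scheduling, alerting, and S3 export are deliberately left out to stay minimal):

```python
from dataclasses import dataclass, field
from typing import Callable, Dict, List


@dataclass
class SuiteResults:
    scores: Dict[str, float] = field(default_factory=dict)  # test name -> 0.0-1.0 score
    threshold: float = 0.8

    @property
    def passed(self) -> bool:
        return all(score >= self.threshold for score in self.scores.values())

    @property
    def summary(self) -> str:
        failed = [name for name, score in self.scores.items() if score < self.threshold]
        return f"{len(failed)} failing case(s): {failed}" if failed else "all cases passed"


def run_suite(test_cases: List[dict],
              evaluators: Dict[str, Callable[[dict], float]],
              threshold: float = 0.8) -> SuiteResults:
    """Dispatch each case to the evaluator registered for its eval_type."""
    results = SuiteResults(threshold=threshold)
    for case in test_cases:
        evaluator = evaluators[case["eval_type"]]
        results.scores[case["name"]] = evaluator(case)
    return results
```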
5. Integration with Existing PraisonAI Features

```python
from praisonaiagents import Agent, Process, Task
from praisonaiagents.eval import EvalSuite
from praisonaiagents.memory import Memory
from praisonaiagents.tools import Tools

# Evaluation-aware agent with memory
agent = Agent(
    name="EvalAgent",
    llm="gpt-4",
    memory=Memory(
        provider="rag",
        quality_threshold=0.8
    ),
    tools=Tools(["web_search", "calculator"]),
    # Built-in evaluation
    eval_config={
        "track_accuracy": True,
        "sample_rate": 0.1,  # Evaluate 10% of runs
        "baseline": "eval_baseline.json"
    }
)

# Process with automatic evaluation (task1, task2: Task instances defined elsewhere)
process = Process(
    agents=[agent],
    tasks=[task1, task2],
    # Enable evaluation mode
    eval_mode=True,
    eval_criteria={
        "min_accuracy": 0.85,
        "max_runtime": 5.0
    }
)

# Run with evaluation
result = process.start()

# Access evaluation metrics
print(f"Process accuracy: {result.eval_metrics.accuracy}")
print(f"Task performances: {result.eval_metrics.task_times}")

# Save evaluation data for analysis
result.eval_metrics.export("process_eval.json")
```
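For `sample_rate`, the cheapest client-side behaviour is probably a probabilistic hook around each run; a sketch of that idea (the hook name and wiring are assumptions):

```python
import random
from typing import Callable, Optional


def maybe_evaluate(output: str,
                   evaluate: Callable[[str], float],
                   sample_rate: float = 0.1) -> Optional[float]:
    """Score roughly `sample_rate` of runs; return None when the run is skipped."""
    if random.random() >= sample_rate:
        return None
    return evaluate(output)


# Example: ~10% of outputs get scored against a trivial length-based "judge"
scores = [maybe_evaluate(text, evaluate=lambda t: float(len(t)), sample_rate=0.1)
          for text in ["run one", "run two", "run three"]]
print([s for s in scores if s is not None])
```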