
create eval framework for praisonaiagents, minimal code implementation on client side, keep it as minimal as possible #967


Description

@MervinPraison

@claude

Create an eval framework for praisonaiagents with a minimal code implementation on the client side; keep it as minimal as possible.

💡 Proposed PraisonAI Agents Implementation

1. Accuracy Evaluation

from praisonaiagents import Agent
from praisonaiagents.eval import AccuracyEval

# Basic usage
agent = Agent(
    name="Analyst",
    role="Data Analyst",
    llm="gpt-4"
)

# Simple accuracy check
eval = AccuracyEval(
    agent=agent,
    input="What is the capital of France?",
    expected_output="Paris"
)

result = eval.run()
print(f"Accuracy: {result.score}/10")
Advanced accuracy evaluation:

from praisonaiagents.eval import AccuracyEval, EvalCriteria

# Multi-criteria evaluation
eval = AccuracyEval(
    agent=agent,
    test_cases=[
        {
            "input": "Summarize the Q1 report",
            "expected_output": "Q1 showed 15% growth...",
            "weight": 2.0  # Higher importance
        },
        {
            "input": "What are the key risks?",
            "expected_output": "Supply chain, market volatility..."
        }
    ],
    criteria=EvalCriteria(
        factual_accuracy=0.4,  # 40% weight
        completeness=0.3,      # 30% weight
        relevance=0.3          # 30% weight
    ),
    evaluator_llm="gpt-4",  # Model used for evaluation
    iterations=5,           # Repeat runs for statistical reliability
    save_results="eval_results.json"
)

# Run with detailed output
result = eval.run(verbose=True)

# Access statistics
print(f"Average: {result.avg_score:.2f}")
print(f"Std Dev: {result.std_dev:.2f}")
print(f"Confidence: {result.confidence_interval}")

2. Reliability Testing

from praisonaiagents.eval import ReliabilityEval

# Test whether the agent calls the expected tools
eval = ReliabilityEval(
    agent=agent,
    test_scenarios=[
        {
            "input": "Search weather and create report",
            "expected_tools": ["web_search", "create_file"],
            "required_order": True   # Tools must be called in this order
        },
        {
            "input": "Analyze CSV data",
            "expected_tools": ["read_csv", "analyze_data"],
            "allow_additional": True  # Other tools allowed
        }
    ]
)

results = eval.run()
for scenario in results.scenarios:
    print(f"Scenario: {scenario.name} - {scenario.status}")
    if scenario.failed_tools:
        print(f"  Failed: {scenario.failed_tools}")

3. Performance Evaluation

from praisonaiagents.eval import PerformanceEval

# Benchmark agent performance
eval = PerformanceEval(
    agent=agent,
    benchmark_queries=[
        "Simple question",
        "Complex analysis task",
        "Multi-step reasoning"
    ],
    metrics={
        "runtime": True,
        "memory": True,
        "tokens": True,  # Token usage tracking
        "ttft": True     # Time to first token
    },
    iterations=50,
    warmup=5
)

result = eval.run()

# Detailed performance report
result.print_report()
# Outputs a table with avg, p50, p95, p99 for each metric

# Compare agents
comparison = PerformanceEval.compare(
    agents=[agent1, agent2, agent3],
    benchmark_suite="standard",  # Predefined benchmarks
    export_format="html"         # Visual comparison report
)
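
The runtime portion of PerformanceEval needs nothing beyond the standard library. A sketch of the percentile report, again treating the agent as a plain callable (memory, token, and TTFT tracking would need hooks into the model client and are left out here):

# Illustrative runtime benchmark (not the final API): avg/p50/p95/p99 in seconds.
import time
from statistics import mean, quantiles
from typing import Callable

def benchmark_runtime(ask_agent: Callable[[str], str],
                      queries: list[str],
                      iterations: int = 50,
                      warmup: int = 5) -> dict[str, float]:
    for _ in range(warmup):
        ask_agent(queries[0])            # warm caches, connections, etc.
    samples = []
    for i in range(iterations):
        start = time.perf_counter()
        ask_agent(queries[i % len(queries)])
        samples.append(time.perf_counter() - start)
    pct = quantiles(samples, n=100)      # pct[k-1] is the k-th percentile
    return {"avg": mean(samples), "p50": pct[49], "p95": pct[94], "p99": pct[98]}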

4. Automated Test Suite

from praisonaiagents.eval import EvalSuite, TestCase, EvalFailure

# Define a comprehensive test suite
suite = EvalSuite(
    name="Agent Quality Assurance",
    agents=[agent],
    test_cases=[
        TestCase(
            name="Basic Math",
            input="What is 15 * 23?",
            expected_output="345",
            eval_type="accuracy",
            tags=["math", "simple"]
        ),
        TestCase(
            name="Tool Usage",
            input="Search and summarize AI news",
            expected_tools=["web_search", "summarize"],
            eval_type="reliability"
        ),
        TestCase(
            name="Performance Baseline",
            input="Standard benchmark query",
            max_runtime=2.0,  # seconds
            max_memory=100,   # MB
            eval_type="performance"
        )
    ],
    # Automation features
    schedule="0 2 * * *",  # Run daily at 2 AM
    alerts={
        "email": "[email protected]",
        "threshold": 0.8   # Alert if score < 80%
    },
    export_results="s3://bucket/eval-results/"
)

# Run the full suite
results = suite.run()

# CI/CD integration
if not results.passed:
    raise EvalFailure(f"Quality gate failed: {results.summary}")

# Generate report
suite.generate_report(
    format="html",
    include_graphs=True,
    compare_with="last_week"
)
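
For CI/CD specifically, the suite does not need a bespoke runner to act as a quality gate; a thin pytest wrapper over the proposed AccuracyEval would do the same job. The file name and threshold below are illustrative:

# test_agent_quality.py -- illustrative pytest quality gate on top of the proposed API.
import pytest
from praisonaiagents import Agent
from praisonaiagents.eval import AccuracyEval  # proposed module

agent = Agent(name="Analyst", role="Data Analyst", llm="gpt-4")
MIN_SCORE = 8.0  # assumed pass threshold on the 0-10 scale

@pytest.mark.parametrize("question,expected", [
    ("What is 15 * 23?", "345"),
    ("What is the capital of France?", "Paris"),
])
def test_agent_accuracy(question, expected):
    result = AccuracyEval(agent=agent, input=question, expected_output=expected).run()
    assert result.score >= MIN_SCORE, f"Quality gate failed for: {question}"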

5. Integration with Existing PraisonAI Features

from praisonaiagents import Agent, Process, Task
from praisonaiagents.eval import EvalSuite
from praisonaiagents.memory import Memory
from praisonaiagents.tools import Tools

# Evaluation-aware agent with memory
agent = Agent(
    name="EvalAgent",
    llm="gpt-4",
    memory=Memory(
        provider="rag",
        quality_threshold=0.8
    ),
    tools=Tools(["web_search", "calculator"]),
    # Built-in evaluation
    eval_config={
        "track_accuracy": True,
        "sample_rate": 0.1,  # Evaluate 10% of runs
        "baseline": "eval_baseline.json"
    }
)

# Process with automatic evaluation
process = Process(
    agents=[agent],
    tasks=[task1, task2],
    # Enable evaluation mode
    eval_mode=True,
    eval_criteria={
        "min_accuracy": 0.85,
        "max_runtime": 5.0
    }
)

# Run with evaluation
result = process.start()

# Access evaluation metrics
print(f"Process accuracy: {result.eval_metrics.accuracy}")
print(f"Task performances: {result.eval_metrics.task_times}")

# Save evaluation data for analysis
result.eval_metrics.export("process_eval.json")
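
The sample_rate option from eval_config can likewise stay trivial on the client side: a post-run hook that evaluates only a random fraction of calls. A minimal sketch, where evaluate stands in for whichever eval above is wired in:

# Illustrative sampled-evaluation hook (not the final API).
import random
from typing import Callable

def maybe_evaluate(prompt: str, response: str,
                   evaluate: Callable[[str, str], None],
                   sample_rate: float = 0.1) -> None:
    # Evaluate roughly sample_rate of all runs to keep eval cost low.
    if random.random() < sample_rate:
        evaluate(prompt, response)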
