Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions py/autoevals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
```python
import os
from openai import AsyncOpenAI
-from autoevals.llm import Correctness
+from autoevals.llm import Factuality

# Configure client to use Braintrust AI Proxy
client = AsyncOpenAI(
Expand All @@ -60,7 +60,7 @@
)

# Use with any evaluator
-evaluator = Correctness(client=client)
+evaluator = Factuality(client=client)
```

**Braintrust integration**:
Expand Down
103 changes: 0 additions & 103 deletions py/autoevals/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,106 +723,3 @@ class Translation(SpecFileClassifier):
"""

pass


class Correctness(SpecFileClassifier):
    """Evaluate if a solution correctly solves a given problem.

    This evaluator uses LLM-based analysis to determine if a solution correctly
    addresses the given problem requirements, considering aspects like:
    - Functional correctness
    - Edge case handling
    - Input validation
    - Output format compliance
    - Implementation completeness

    Example:
    ```python
    from openai import OpenAI
    from autoevals import Correctness

    correctness = Correctness(client=OpenAI())
    result = correctness.eval(
        instructions='''
        Write a function that takes a list of integers and returns their sum.
        The function should handle empty lists by returning 0.
        ''',
        output='''
        def sum_list(numbers):
            if not numbers:
                return 0
            return sum(numbers)
        '''
    )

    print(result.score)  # 1 if correct, 0 if incorrect
    print(result.metadata["rationale"])  # Detailed explanation
    print(result.metadata["choice"])  # Selected choice (correct/incorrect)
    ```

    Args:
        instructions: Problem description or task requirements to evaluate against
        output: Solution to evaluate (code, text, or other content)

    Returns:
        Score object with:
        - score: 1 if solution is correct, 0 if incorrect
        - metadata.rationale: Detailed explanation of the evaluation
        - metadata.choice: Selected choice (correct/incorrect)
    """

    pass


class Complexity(SpecFileClassifier):
    """Evaluate the complexity and efficiency of a solution.

    This evaluator uses LLM-based analysis to assess various aspects of solution complexity:
    - Time complexity (Big O notation)
    - Space complexity
    - Code readability and maintainability
    - Implementation efficiency
    - Resource utilization
    - Algorithmic optimizations
    - Design patterns and best practices

    Example:
    ```python
    from openai import OpenAI
    from autoevals import Complexity

    complexity = Complexity(client=OpenAI())
    result = complexity.eval(
        instructions="Implement a function to find duplicates in a list",
        output='''
        def find_duplicates(arr):
            seen = set()
            duplicates = set()
            for x in arr:
                if x in seen:
                    duplicates.add(x)
                seen.add(x)
            return list(duplicates)
        '''
    )

    print(result.score)  # 1 if efficient, 0 if inefficient
    print(result.metadata["rationale"])  # Detailed complexity analysis
    print(result.metadata["choice"])  # Selected choice (efficient/inefficient)
    print(result.metadata["time_complexity"])  # Estimated Big O notation
    print(result.metadata["space_complexity"])  # Space usage analysis
    ```

    Args:
        instructions: Problem description or requirements to evaluate against
        output: Solution to analyze for complexity (code, algorithm, system design)

    Returns:
        Score object with:
        - score: 1 if efficient, 0 if inefficient
        - metadata.rationale: Detailed complexity analysis
        - metadata.choice: Selected choice (efficient/inefficient)
        - metadata.time_complexity: Time complexity analysis
        - metadata.space_complexity: Space complexity analysis
    """

    pass
2 changes: 1 addition & 1 deletion py/autoevals/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
-VERSION = "0.0.125"
+VERSION = "0.0.126"