diff --git a/py/autoevals/__init__.py b/py/autoevals/__init__.py
index 807988e..64e3a5c 100644
--- a/py/autoevals/__init__.py
+++ b/py/autoevals/__init__.py
@@ -51,7 +51,7 @@
 ```python
 import os
 from openai import AsyncOpenAI
-from autoevals.llm import Correctness
+from autoevals.llm import Factuality
 
 # Configure client to use Braintrust AI Proxy
 client = AsyncOpenAI(
@@ -60,7 +60,7 @@
 )
 
 # Use with any evaluator
-evaluator = Correctness(client=client)
+evaluator = Factuality(client=client)
 ```
 
 **Braintrust integration**:
diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
index f07272f..ad083cf 100644
--- a/py/autoevals/llm.py
+++ b/py/autoevals/llm.py
@@ -723,106 +723,3 @@ class Translation(SpecFileClassifier):
     """
 
     pass
-
-
-class Correctness(SpecFileClassifier):
-    """Evaluate if a solution correctly solves a given problem.
-
-    This evaluator uses LLM-based analysis to determine if a solution correctly
-    addresses the given problem requirements, considering aspects like:
-    - Functional correctness
-    - Edge case handling
-    - Input validation
-    - Output format compliance
-    - Implementation completeness
-
-    Example:
-    ```python
-    from openai import OpenAI
-    from autoevals import Correctness
-
-    correctness = Correctness(client=OpenAI())
-    result = correctness.eval(
-        instructions='''
-        Write a function that takes a list of integers and returns their sum.
-        The function should handle empty lists by returning 0.
-        ''',
-        output='''
-        def sum_list(numbers):
-            if not numbers:
-                return 0
-            return sum(numbers)
-        '''
-    )
-
-    print(result.score)  # 1 if correct, 0 if incorrect
-    print(result.metadata["rationale"])  # Detailed explanation
-    print(result.metadata["choice"])  # Selected choice (correct/incorrect)
-    ```
-
-    Args:
-        instructions: Problem description or task requirements to evaluate against
-        output: Solution to evaluate (code, text, or other content)
-
-    Returns:
-        Score object with:
-        - score: 1 if solution is correct, 0 if incorrect
-        - metadata.rationale: Detailed explanation of the evaluation
-        - metadata.choice: Selected choice (correct/incorrect)
-    """
-
-    pass
-
-
-class Complexity(SpecFileClassifier):
-    """Evaluate the complexity and efficiency of a solution.
-
-    This evaluator uses LLM-based analysis to assess various aspects of solution complexity:
-    - Time complexity (Big O notation)
-    - Space complexity
-    - Code readability and maintainability
-    - Implementation efficiency
-    - Resource utilization
-    - Algorithmic optimizations
-    - Design patterns and best practices
-
-    Example:
-    ```python
-    from autoevals import Complexity
-
-    complexity = Complexity(client=OpenAI())
-    result = complexity.eval(
-        instructions="Implement a function to find duplicates in a list",
-        output='''
-        def find_duplicates(arr):
-            seen = set()
-            duplicates = set()
-            for x in arr:
-                if x in seen:
-                    duplicates.add(x)
-                seen.add(x)
-            return list(duplicates)
-        '''
-    )
-
-    print(result.score)  # 1 if efficient, 0 if inefficient
-    print(result.metadata["rationale"])  # Detailed complexity analysis
-    print(result.metadata["choice"])  # Selected choice (efficient/inefficient)
-    print(result.metadata["time_complexity"])  # Estimated Big O notation
-    print(result.metadata["space_complexity"])  # Space usage analysis
-    ```
-
-    Args:
-        instructions: Problem description or requirements to evaluate against
-        output: Solution to analyze for complexity (code, algorithm, system design)
-
-    Returns:
-        Score object with:
-        - score: 1 if efficient, 0 if inefficient
-        - metadata.rationale: Detailed complexity analysis
-        - metadata.choice: Selected choice (efficient/inefficient)
-        - metadata.time_complexity: Time complexity analysis
-        - metadata.space_complexity: Space complexity analysis
-    """
-
-    pass
diff --git a/py/autoevals/version.py b/py/autoevals/version.py
index 86d1f97..b726015 100644
--- a/py/autoevals/version.py
+++ b/py/autoevals/version.py
@@ -1 +1 @@
-VERSION = "0.0.125"
+VERSION = "0.0.126"
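The updated README hunk above only shows the `Factuality` evaluator being imported and constructed; the hunk boundaries elide the client configuration and never run an evaluation. As a point of reference, here is a minimal sketch of how the updated example might be exercised end to end. The proxy `base_url`, the `BRAINTRUST_API_KEY` environment variable, and the sample inputs are assumptions for illustration and are not part of this patch; the `client=` keyword and the `score`/`metadata` fields follow the usage shown elsewhere in this diff.

```python
# Minimal sketch, not part of the patch: running the Factuality evaluator
# through an OpenAI client pointed at the Braintrust AI Proxy. The base_url
# and environment-variable name below are assumptions for illustration.
import os

from openai import OpenAI
from autoevals.llm import Factuality

client = OpenAI(
    base_url="https://api.braintrust.dev/v1/proxy",  # assumed proxy endpoint
    api_key=os.environ.get("BRAINTRUST_API_KEY"),    # assumed env var name
)

# Same client= keyword used in the README snippet above
evaluator = Factuality(client=client)

result = evaluator.eval(
    input="Which country has the highest population?",
    output="The People's Republic of China",
    expected="China",
)

print(result.score)                  # value in [0, 1]; 1 means fully factual
print(result.metadata["rationale"])  # grader's explanation
print(result.metadata["choice"])     # choice selected by the grader
```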