3 changes: 1 addition & 2 deletions py/autoevals/__init__.py
@@ -120,14 +120,13 @@ async def evaluate_qa():
 See individual module documentation for detailed usage and options.
 """
 
-from braintrust_core.score import Score, Scorer
-
 from .json import *
 from .list import *
 from .llm import *
 from .moderation import *
 from .number import *
 from .oai import init
 from .ragas import *
+from .score import Score, Scorer
 from .string import *
 from .value import ExactMatch
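Net effect of this hunk: Score and Scorer remain importable from the package root, now backed by the vendored .score module instead of braintrust_core. A minimal sketch of an unchanged downstream call site, assuming the package is installed as autoevals:

import autoevals

# The public API is unaffected by the vendoring; Score still validates its range.
score = autoevals.Score(name="demo", score=0.5)
assert isinstance(score, autoevals.Score) and score.score == 0.5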
2 changes: 1 addition & 1 deletion py/autoevals/json.py
@@ -15,12 +15,12 @@
 
 import json
 
-from braintrust_core.score import Score, Scorer
 from jsonschema import ValidationError, validate
 
 from autoevals.partial import ScorerWithPartial
 
 from .number import NumericDiff
+from .score import Score, Scorer
 from .string import Levenshtein


3 changes: 1 addition & 2 deletions py/autoevals/list.py
@@ -1,9 +1,8 @@
 import sys
 
-from braintrust_core.score import Score
-
 from autoevals.partial import ScorerWithPartial
 
+from .score import Score
 from .string import Levenshtein


2 changes: 1 addition & 1 deletion py/autoevals/llm.py
@@ -53,11 +53,11 @@
 
 import chevron
 import yaml
-from braintrust_core.score import Score
 
 from autoevals.partial import ScorerWithPartial
 
 from .oai import Client, arun_cached_request, run_cached_request
+from .score import Score
 
 # Disable HTML escaping in chevron.
 chevron.renderer._html_escape = lambda x: x # type: ignore[attr-defined]
3 changes: 1 addition & 2 deletions py/autoevals/moderation.py
@@ -1,10 +1,9 @@
 from typing import Optional
 
-from braintrust_core.score import Score
-
 from autoevals.llm import OpenAIScorer
 
 from .oai import Client, arun_cached_request, run_cached_request
+from .score import Score
 
 REQUEST_TYPE = "moderation"

4 changes: 2 additions & 2 deletions py/autoevals/number.py
@@ -11,10 +11,10 @@
 - Suitable for both small and large number comparisons
 """
 
-from braintrust_core.score import Score
-
 from autoevals.partial import ScorerWithPartial
 
+from .score import Score
+
 
 class NumericDiff(ScorerWithPartial):
     """Numeric similarity scorer using normalized difference.
2 changes: 1 addition & 1 deletion py/autoevals/partial.py
@@ -1,4 +1,4 @@
-from braintrust_core.score import Scorer
+from .score import Scorer
 
 
 class ScorerWithPartial(Scorer):
64 changes: 64 additions & 0 deletions py/autoevals/score.py
@@ -0,0 +1,64 @@
import dataclasses
import sys
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional

from .serializable_data_class import SerializableDataClass


@dataclasses.dataclass
class Score(SerializableDataClass):
    """A score for an evaluation. The score is a float between 0 and 1."""

    name: str
    """The name of the score. This should be a unique name for the scorer."""

    score: Optional[float]
    """The score for the evaluation. This should be a float between 0 and 1. If the score is None, the evaluation is considered to be skipped."""

    metadata: Dict[str, Any] = dataclasses.field(default_factory=dict)
    """Metadata for the score. This can be used to store additional information about the score."""

    # DEPRECATION_NOTICE: this field is deprecated, as errors are propagated up to the caller.
    error: Optional[Exception] = None
    """Deprecated: The error field is deprecated, as errors are now propagated to the caller. The field will be removed in a future version of the library."""

    def as_dict(self):
        return {
            "score": self.score,
            "metadata": self.metadata,
        }

    def __post_init__(self):
        if self.score is not None and (self.score < 0 or self.score > 1):
            raise ValueError(f"score ({self.score}) must be between 0 and 1")
        if self.error is not None:
            print(
                "The error field is deprecated, as errors are now propagated to the caller. The field will be removed in a future version of the library",
                sys.stderr,
            )
Comment on lines +36 to +39
The print statement passes sys.stderr as a positional argument instead of via the file parameter, so the stream object is appended to the printed text on stdout rather than the message being routed to stderr. The statement should be modified to:

print(
    "The error field is deprecated, as errors are now propagated to the caller. The field will be removed in a future version of the library",
    file=sys.stderr
)
Spotted by Diamond



class Scorer(ABC):
    async def eval_async(self, output: Any, expected: Any = None, **kwargs: Any) -> Score:
        return await self._run_eval_async(output, expected, **kwargs)

    def eval(self, output: Any, expected: Any = None, **kwargs: Any) -> Score:
        return self._run_eval_sync(output, expected, **kwargs)

    def __call__(self, output: Any, expected: Any = None, **kwargs: Any) -> Score:
        return self.eval(output, expected, **kwargs)

    async def _run_eval_async(self, output: Any, expected: Any = None, **kwargs: Any) -> Score:
        # By default, just run the sync version directly; subclasses can
        # override this with a genuinely async implementation.
        return self._run_eval_sync(output, expected, **kwargs)

    def _name(self) -> str:
        return self.__class__.__name__

    @abstractmethod
    def _run_eval_sync(self, output: Any, expected: Any = None, **kwargs: Any) -> Score:
        ...


__all__ = ["Score", "Scorer"]
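A minimal sketch of the vendored API in use — StartsWith and its scoring rule are hypothetical, but Score, Scorer, and the _run_eval_sync hook are exactly as defined in this file:

from autoevals.score import Score, Scorer


class StartsWith(Scorer):
    """Hypothetical scorer: 1.0 if the output starts with the expected prefix."""

    def _run_eval_sync(self, output, expected=None, **kwargs):
        matched = expected is not None and str(output).startswith(str(expected))
        return Score(name=self._name(), score=1.0 if matched else 0.0)


print(StartsWith()(output="hello world", expected="hello").score)  # 1.0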
65 changes: 65 additions & 0 deletions py/autoevals/serializable_data_class.py
@@ -0,0 +1,65 @@
import dataclasses
import json
from typing import Dict, Union, get_origin


class SerializableDataClass:
    def as_dict(self):
        """Serialize the object to a dictionary."""
        return dataclasses.asdict(self)

    def as_json(self, **kwargs):
        """Serialize the object to JSON."""
        return json.dumps(self.as_dict(), **kwargs)

    def __getitem__(self, item: str):
        return getattr(self, item)

    @classmethod
    def from_dict(cls, d: Dict):
        """Deserialize the object from a dictionary. This method
        is shallow and will not call from_dict() on nested objects."""
        fields = set(f.name for f in dataclasses.fields(cls))
        filtered = {k: v for k, v in d.items() if k in fields}
        return cls(**filtered)

    @classmethod
    def from_dict_deep(cls, d: Dict):
        """Deserialize the object from a dictionary. This method
        is deep and will call from_dict_deep() on nested objects."""
        fields = {f.name: f for f in dataclasses.fields(cls)}
        filtered = {}
        for k, v in d.items():
            if k not in fields:
                continue

            if (
                isinstance(v, dict)
                and isinstance(fields[k].type, type)
                and issubclass(fields[k].type, SerializableDataClass)
            ):
                filtered[k] = fields[k].type.from_dict_deep(v)
            elif get_origin(fields[k].type) == Union:
                for t in fields[k].type.__args__:
                    if t == type(None) and v is None:
                        filtered[k] = None
                        break
                    if isinstance(t, type) and issubclass(t, SerializableDataClass) and v is not None:
                        try:
                            filtered[k] = t.from_dict_deep(v)
                            break
                        except TypeError:
                            pass
                else:
                    filtered[k] = v
            elif (
                isinstance(v, list)
                and get_origin(fields[k].type) == list
                and len(fields[k].type.__args__) == 1
                and isinstance(fields[k].type.__args__[0], type)
                and issubclass(fields[k].type.__args__[0], SerializableDataClass)
            ):
                filtered[k] = [fields[k].type.__args__[0].from_dict_deep(i) for i in v]
            else:
                filtered[k] = v
        return cls(**filtered)
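To make the shallow vs. deep distinction concrete, a short sketch — the Inner and Outer dataclasses here are illustrative only, not part of the library:

from dataclasses import dataclass, field

from autoevals.serializable_data_class import SerializableDataClass


@dataclass
class Inner(SerializableDataClass):
    value: str = ""


@dataclass
class Outer(SerializableDataClass):
    inner: Inner = field(default_factory=Inner)


d = {"inner": {"value": "x"}, "unknown_key": 1}  # unknown keys are filtered out
shallow = Outer.from_dict(d)    # shallow.inner is still a plain dict
deep = Outer.from_dict_deep(d)  # deep.inner is an Inner instance
assert isinstance(shallow.inner, dict)
assert isinstance(deep.inner, Inner) and deep.inner.value == "x"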
2 changes: 1 addition & 1 deletion py/autoevals/string.py
@@ -20,13 +20,13 @@
 import threading
 from typing import Optional
 
-from braintrust_core.score import Score
 from polyleven import levenshtein as distance
 
 from autoevals.partial import ScorerWithPartial
 from autoevals.value import normalize_value
 
 from .oai import LLMClient, arun_cached_request, run_cached_request
+from .score import Score
 
 
 class Levenshtein(ScorerWithPartial):
62 changes: 62 additions & 0 deletions py/autoevals/test_serializable_data_class.py
@@ -0,0 +1,62 @@
import unittest
from dataclasses import dataclass
from typing import List, Optional

from .serializable_data_class import SerializableDataClass


@dataclass
class PromptData(SerializableDataClass):
    prompt: Optional[str] = None
    options: Optional[dict] = None


@dataclass
class PromptSchema(SerializableDataClass):
    id: str
    project_id: str
    _xact_id: str
    name: str
    slug: str
    description: Optional[str]
    prompt_data: PromptData
    tags: Optional[List[str]]


class TestSerializableDataClass(unittest.TestCase):
    def test_from_dict_deep_with_none_values(self):
        """Test that from_dict_deep correctly handles None values in nested objects."""
        test_dict = {
            "id": "456",
            "project_id": "123",
            "_xact_id": "789",
            "name": "test-prompt",
            "slug": "test-prompt",
            "description": None,
            "prompt_data": {"prompt": None, "options": None},
            "tags": None,
        }

        prompt = PromptSchema.from_dict_deep(test_dict)

        # Verify all fields were set correctly.
        self.assertEqual(prompt.id, "456")
        self.assertEqual(prompt.project_id, "123")
        self.assertEqual(prompt._xact_id, "789")
        self.assertEqual(prompt.name, "test-prompt")
        self.assertEqual(prompt.slug, "test-prompt")
        self.assertIsNone(prompt.description)
        self.assertIsNone(prompt.tags)

        # Verify nested object was created and its fields are None.
        self.assertIsInstance(prompt.prompt_data, PromptData)
        self.assertIsNone(prompt.prompt_data.prompt)
        self.assertIsNone(prompt.prompt_data.options)

        # Verify round-trip serialization works.
        round_trip = PromptSchema.from_dict_deep(prompt.as_dict())
        self.assertEqual(round_trip.as_dict(), test_dict)


if __name__ == "__main__":
    unittest.main()
4 changes: 2 additions & 2 deletions py/autoevals/value.py
@@ -38,10 +38,10 @@
 import json
 from typing import Any
 
-from braintrust_core.score import Score
-
 from autoevals.partial import ScorerWithPartial
 
+from .score import Score
+
 
 class ExactMatch(ScorerWithPartial):
     """A scorer that tests for exact equality between values.