Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 5 additions & 11 deletions evalbench/scorers/setmatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
"""

from typing import Tuple
from collections import Counter

from scorers import comparator
from scorers.util import make_hashable
from scorers.comparator import convert_to_set


Expand Down Expand Up @@ -54,17 +56,9 @@ def _is_document_structure(data):
return False

if _is_document_structure(golden_execution_result) or _is_document_structure(generated_execution_result):
def _make_hashable(item):
if isinstance(item, list):
return tuple(_make_hashable(x) for x in item)
elif isinstance(item, dict):
return tuple(sorted((k, _make_hashable(v)) for k, v in item.items()))
else:
return item

h1 = [_make_hashable(d) for d in golden_execution_result]
h2 = [_make_hashable(d) for d in generated_execution_result]
score = 100 if sorted(h1) == sorted(h2) else 0
h1 = [make_hashable(d) for d in golden_execution_result]
h2 = [make_hashable(d) for d in generated_execution_result]
score = 100 if Counter(h1) == Counter(h2) else 0
else:
# SQL Model: flat primitives, ignore column names, remove duplicates
golden_execution_result_tuple = [
Expand Down
2 changes: 1 addition & 1 deletion evalbench/scorers/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def with_cache_execute(

def make_hashable(value):
    """Return a hashable equivalent of ``value``, converting nested containers.

    Lists and tuples become tuples whose elements are converted recursively,
    so element order is preserved. Dicts become a frozenset of
    ``(key, converted_value)`` pairs, so key order does not matter. Any other
    value is returned unchanged.

    Args:
        value: A list, tuple, dict, or any other object.

    Returns:
        The converted value. It is hashable provided every leaf value and
        every dict key is itself hashable.
    """
    if isinstance(value, (list, tuple)):
        # Recurse into elements: a shallow tuple(value) would still hold
        # unhashable lists/dicts when the input is nested.
        return tuple(make_hashable(v) for v in value)
    if isinstance(value, dict):
        return frozenset((k, make_hashable(v)) for k, v in value.items())
    return value
40 changes: 40 additions & 0 deletions evalbench/test/llmrater_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import unittest
from scorers.llmrater import LLMRater


class TestLLMRater(unittest.TestCase):
    """Exercises LLMRater.take_n_uniques with nested, flat, large and empty inputs."""

    def test_take_n_uniques_with_document_model(self):
        # Document-model results nest lists of dicts inside each record;
        # the call must complete without raising TypeError.
        limit = 50
        documents = [
            {"authors": [{"name": "Alice"}, {"name": "Bob"}]}
        ]
        try:
            uniques = LLMRater.take_n_uniques(documents, limit)
            self.assertEqual(len(uniques), 1)
        except TypeError as e:
            self.fail(f"take_n_uniques raised TypeError unexpectedly: {e}")

    def test_take_n_uniques_with_flat_dict(self):
        # SQL-style rows are flat dicts; the repeated first row should be
        # collapsed, leaving two distinct rows.
        limit = 50
        first_row = {"id": 1, "name": "Alice"}
        second_row = {"id": 2, "name": "Bob"}
        repeated_row = {"id": 1, "name": "Alice"}
        rows = [first_row, second_row, repeated_row]
        uniques = LLMRater.take_n_uniques(rows, limit)
        self.assertEqual(len(uniques), 2)

    def test_take_n_uniques_limit(self):
        # 100 distinct rows with a cap of 50: exactly 50 are expected back.
        limit = 50
        rows = [{"id": row_id} for row_id in range(100)]
        uniques = LLMRater.take_n_uniques(rows, limit)
        self.assertEqual(len(uniques), 50)

    def test_take_n_uniques_empty(self):
        # Edge case: an empty input list should give an empty result.
        limit = 50
        uniques = LLMRater.take_n_uniques([], limit)
        self.assertEqual(len(uniques), 0)


# Run the unittest runner only when this module is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    unittest.main()
Loading