diff --git a/evalbench/scorers/setmatcher.py b/evalbench/scorers/setmatcher.py
index 577362d..18de61a 100644
--- a/evalbench/scorers/setmatcher.py
+++ b/evalbench/scorers/setmatcher.py
@@ -6,8 +6,10 @@
 """
 
 from typing import Tuple
+from collections import Counter
 
 from scorers import comparator
+from scorers.util import make_hashable
 from scorers.comparator import convert_to_set
 
 
@@ -54,17 +56,9 @@ def _is_document_structure(data):
                 return False
 
             if _is_document_structure(golden_execution_result) or _is_document_structure(generated_execution_result):
-                def _make_hashable(item):
-                    if isinstance(item, list):
-                        return tuple(_make_hashable(x) for x in item)
-                    elif isinstance(item, dict):
-                        return tuple(sorted((k, _make_hashable(v)) for k, v in item.items()))
-                    else:
-                        return item
-
-                h1 = [_make_hashable(d) for d in golden_execution_result]
-                h2 = [_make_hashable(d) for d in generated_execution_result]
-                score = 100 if sorted(h1) == sorted(h2) else 0
+                h1 = [make_hashable(d) for d in golden_execution_result]
+                h2 = [make_hashable(d) for d in generated_execution_result]
+                score = 100 if Counter(h1) == Counter(h2) else 0
             else:
                 # SQL Model: flat primitives, ignore column names, remove duplicates
                 golden_execution_result_tuple = [
diff --git a/evalbench/scorers/util.py b/evalbench/scorers/util.py
index 3094234..1d4d5a6 100644
--- a/evalbench/scorers/util.py
+++ b/evalbench/scorers/util.py
@@ -57,7 +57,7 @@ def with_cache_execute(
 
 def make_hashable(value):
     if isinstance(value, list):
-        return tuple(value)
+        return tuple(make_hashable(v) for v in value)
     elif isinstance(value, dict):
         return frozenset((k, make_hashable(v)) for k, v in value.items())
     return value
diff --git a/evalbench/test/llmrater_test.py b/evalbench/test/llmrater_test.py
new file mode 100644
index 0000000..1b9cbb7
--- /dev/null
+++ b/evalbench/test/llmrater_test.py
@@ -0,0 +1,40 @@
+import unittest
+from scorers.llmrater import LLMRater
+
+
+class TestLLMRater(unittest.TestCase):
+    def test_take_n_uniques_with_document_model(self):
+        # A typical Document model returned result containing nested lists of dictionaries
+        golden = [
+            {"authors": [{"name": "Alice"}, {"name": "Bob"}]}
+        ]
+        try:
+            result = LLMRater.take_n_uniques(golden, 50)
+            self.assertEqual(len(result), 1)
+        except TypeError as e:
+            self.fail(f"take_n_uniques raised TypeError unexpectedly: {e}")
+
+    def test_take_n_uniques_with_flat_dict(self):
+        # Classic SQL row model where results are flat dicts
+        golden = [
+            {"id": 1, "name": "Alice"},
+            {"id": 2, "name": "Bob"},
+            {"id": 1, "name": "Alice"}  # Duplicate should be removed
+        ]
+        result = LLMRater.take_n_uniques(golden, 50)
+        self.assertEqual(len(result), 2)
+
+    def test_take_n_uniques_limit(self):
+        # Ensure it respects the 'n' limit
+        golden = [{"id": i} for i in range(100)]
+        result = LLMRater.take_n_uniques(golden, 50)
+        self.assertEqual(len(result), 50)
+
+    def test_take_n_uniques_empty(self):
+        # Edge case: empty list
+        result = LLMRater.take_n_uniques([], 50)
+        self.assertEqual(len(result), 0)
+
+
+if __name__ == '__main__':
+    unittest.main()