5 | 5 | # the root directory of this source tree. |
6 | 6 |
7 | 7 | import os |
8 | | -import unittest |
9 | 8 | from unittest.mock import MagicMock, patch |
10 | 9 |
11 | 10 | import pytest |
12 | 11 |
13 | 12 | from llama_stack.apis.benchmarks import Benchmark |
14 | 13 | from llama_stack.apis.common.job_types import Job, JobStatus |
15 | 14 | from llama_stack.apis.eval.eval import BenchmarkConfig, EvaluateResponse, ModelCandidate, SamplingParams |
| 15 | +from llama_stack.apis.inference.inference import TopPSamplingStrategy |
| 16 | +from llama_stack.apis.resource import ResourceType |
16 | 17 | from llama_stack.models.llama.sku_types import CoreModelId |
17 | 18 | from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig |
18 | 19 | from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl |
21 | 22 | MOCK_BENCHMARK_ID = "test-benchmark" |
22 | 23 |
23 | 24 |
24 | | -class TestNVIDIAEvalImpl(unittest.TestCase): |
25 | | - def setUp(self): |
26 | | - os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test" |
27 | | - |
28 | | - # Create mock APIs |
29 | | - self.datasetio_api = MagicMock() |
30 | | - self.datasets_api = MagicMock() |
31 | | - self.scoring_api = MagicMock() |
32 | | - self.inference_api = MagicMock() |
33 | | - self.agents_api = MagicMock() |
34 | | - |
35 | | - self.config = NVIDIAEvalConfig( |
36 | | - evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"], |
37 | | - ) |
38 | | - |
39 | | - self.eval_impl = NVIDIAEvalImpl( |
40 | | - config=self.config, |
41 | | - datasetio_api=self.datasetio_api, |
42 | | - datasets_api=self.datasets_api, |
43 | | - scoring_api=self.scoring_api, |
44 | | - inference_api=self.inference_api, |
45 | | - agents_api=self.agents_api, |
46 | | - ) |
47 | | - |
48 | | - # Mock the HTTP request methods |
49 | | - self.evaluator_get_patcher = patch( |
50 | | - "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get" |
51 | | - ) |
52 | | - self.evaluator_post_patcher = patch( |
53 | | - "llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post" |
54 | | - ) |
55 | | - |
56 | | - self.mock_evaluator_get = self.evaluator_get_patcher.start() |
57 | | - self.mock_evaluator_post = self.evaluator_post_patcher.start() |
58 | | - |
59 | | - def tearDown(self): |
60 | | - """Clean up after each test.""" |
61 | | - self.evaluator_get_patcher.stop() |
62 | | - self.evaluator_post_patcher.stop() |
63 | | - |
64 | | - def _assert_request_body(self, expected_json): |
65 | | - """Helper method to verify request body in Evaluator POST request is correct""" |
66 | | - call_args = self.mock_evaluator_post.call_args |
67 | | - actual_json = call_args[0][1] |
68 | | - |
69 | | - # Check that all expected keys contain the expected values in the actual JSON |
70 | | - for key, value in expected_json.items(): |
71 | | - assert key in actual_json, f"Key '{key}' missing in actual JSON" |
72 | | - |
73 | | - if isinstance(value, dict): |
74 | | - for nested_key, nested_value in value.items(): |
75 | | - assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']" |
76 | | - assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'" |
77 | | - else: |
78 | | - assert actual_json[key] == value, f"Value mismatch for '{key}'" |
79 | | - |
80 | | - @pytest.fixture(autouse=True) |
81 | | - def inject_fixtures(self, run_async): |
82 | | - self.run_async = run_async |
83 | | - |
84 | | - def test_register_benchmark(self): |
85 | | - eval_config = { |
86 | | - "type": "custom", |
87 | | - "params": {"parallelism": 8}, |
88 | | - "tasks": { |
89 | | - "qa": { |
90 | | - "type": "completion", |
91 | | - "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}}, |
92 | | - "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"}, |
93 | | - "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}}, |
94 | | - } |
95 | | - }, |
| 25 | +@pytest.fixture |
| 26 | +def nvidia_eval_setup(): |
| 27 | + """Set up the NVIDIA eval implementation with mocked dependencies.""" |
| 28 | + os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test" |
| 29 | + |
| 30 | + # Create mock APIs |
| 31 | + datasetio_api = MagicMock() |
| 32 | + datasets_api = MagicMock() |
| 33 | + scoring_api = MagicMock() |
| 34 | + inference_api = MagicMock() |
| 35 | + agents_api = MagicMock() |
| 36 | + |
| 37 | + config = NVIDIAEvalConfig( |
| 38 | + evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"], |
| 39 | + ) |
| 40 | + |
| 41 | + eval_impl = NVIDIAEvalImpl( |
| 42 | + config=config, |
| 43 | + datasetio_api=datasetio_api, |
| 44 | + datasets_api=datasets_api, |
| 45 | + scoring_api=scoring_api, |
| 46 | + inference_api=inference_api, |
| 47 | + agents_api=agents_api, |
| 48 | + ) |
| 49 | + |
| 50 | + # Mock the HTTP request methods |
| 51 | + with ( |
| 52 | + patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get, |
| 53 | + patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post, |
| 54 | + ): |
| 55 | + yield { |
| 56 | + "eval_impl": eval_impl, |
| 57 | + "mock_evaluator_get": mock_evaluator_get, |
| 58 | + "mock_evaluator_post": mock_evaluator_post, |
| 59 | + "datasetio_api": datasetio_api, |
| 60 | + "datasets_api": datasets_api, |
| 61 | + "scoring_api": scoring_api, |
| 62 | + "inference_api": inference_api, |
| 63 | + "agents_api": agents_api, |
96 | 64 | } |
97 | 65 |
98 | | - benchmark = Benchmark( |
99 | | - provider_id="nvidia", |
100 | | - type="benchmark", |
101 | | - identifier=MOCK_BENCHMARK_ID, |
102 | | - dataset_id=MOCK_DATASET_ID, |
103 | | - scoring_functions=["basic::equality"], |
104 | | - metadata=eval_config, |
105 | | - ) |
106 | | - |
107 | | - # Mock Evaluator API response |
108 | | - mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"} |
109 | | - self.mock_evaluator_post.return_value = mock_evaluator_response |
110 | | - |
111 | | - # Register the benchmark |
112 | | - self.run_async(self.eval_impl.register_benchmark(benchmark)) |
113 | | - |
114 | | - # Verify the Evaluator API was called correctly |
115 | | - self.mock_evaluator_post.assert_called_once() |
116 | | - self._assert_request_body({"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}) |
117 | | - |
118 | | - def test_run_eval(self): |
119 | | - benchmark_config = BenchmarkConfig( |
120 | | - eval_candidate=ModelCandidate( |
121 | | - type="model", |
122 | | - model=CoreModelId.llama3_1_8b_instruct.value, |
123 | | - sampling_params=SamplingParams(max_tokens=100, temperature=0.7), |
124 | | - ) |
125 | | - ) |
126 | | - |
127 | | - # Mock Evaluator API response |
128 | | - mock_evaluator_response = {"id": "job-123", "status": "created"} |
129 | | - self.mock_evaluator_post.return_value = mock_evaluator_response |
130 | | - |
131 | | - # Run the Evaluation job |
132 | | - result = self.run_async( |
133 | | - self.eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config) |
134 | | - ) |
135 | 66 |
136 | | - # Verify the Evaluator API was called correctly |
137 | | - self.mock_evaluator_post.assert_called_once() |
138 | | - self._assert_request_body( |
139 | | - { |
140 | | - "config": f"nvidia/{MOCK_BENCHMARK_ID}", |
141 | | - "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"}, |
| 67 | +def _assert_request_body(mock_evaluator_post, expected_json): |
| 68 | +    """Helper to verify that the request body of an Evaluator POST request contains the expected values.""" |
| 69 | + call_args = mock_evaluator_post.call_args |
| 70 | + actual_json = call_args[0][1] |
| 71 | + |
| 72 | +    # Check that every expected key is present in the actual JSON and maps to the expected value |
| 73 | + for key, value in expected_json.items(): |
| 74 | + assert key in actual_json, f"Key '{key}' missing in actual JSON" |
| 75 | + |
| 76 | + if isinstance(value, dict): |
| 77 | + for nested_key, nested_value in value.items(): |
| 78 | + assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']" |
| 79 | + assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'" |
| 80 | + else: |
| 81 | + assert actual_json[key] == value, f"Value mismatch for '{key}'" |
| 82 | + |
| 83 | + |
| 84 | +@pytest.mark.asyncio |
| 85 | +async def test_register_benchmark(nvidia_eval_setup): |
| 86 | + eval_impl = nvidia_eval_setup["eval_impl"] |
| 87 | + mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"] |
| 88 | + |
| 89 | + eval_config = { |
| 90 | + "type": "custom", |
| 91 | + "params": {"parallelism": 8}, |
| 92 | + "tasks": { |
| 93 | + "qa": { |
| 94 | + "type": "completion", |
| 95 | + "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}}, |
| 96 | + "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"}, |
| 97 | + "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}}, |
142 | 98 | } |
| 99 | + }, |
| 100 | + } |
| 101 | + |
| 102 | + benchmark = Benchmark( |
| 103 | + provider_id="nvidia", |
| 104 | + type=ResourceType.benchmark, |
| 105 | + identifier=MOCK_BENCHMARK_ID, |
| 106 | + dataset_id=MOCK_DATASET_ID, |
| 107 | + scoring_functions=["basic::equality"], |
| 108 | + metadata=eval_config, |
| 109 | + ) |
| 110 | + |
| 111 | + # Mock Evaluator API response |
| 112 | + mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"} |
| 113 | + mock_evaluator_post.return_value = mock_evaluator_response |
| 114 | + |
| 115 | + # Register the benchmark |
| 116 | + await eval_impl.register_benchmark(benchmark) |
| 117 | + |
| 118 | + # Verify the Evaluator API was called correctly |
| 119 | + mock_evaluator_post.assert_called_once() |
| 120 | + _assert_request_body( |
| 121 | + mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config} |
| 122 | + ) |
| 123 | + |
| 124 | + |
| 125 | +@pytest.mark.asyncio |
| 126 | +async def test_run_eval(nvidia_eval_setup): |
| 127 | + eval_impl = nvidia_eval_setup["eval_impl"] |
| 128 | + mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"] |
| 129 | + |
| 130 | + benchmark_config = BenchmarkConfig( |
| 131 | + eval_candidate=ModelCandidate( |
| 132 | + type="model", |
| 133 | + model=CoreModelId.llama3_1_8b_instruct.value, |
| 134 | + sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)), |
143 | 135 | ) |
144 | | - |
145 | | - # Verify the result |
146 | | - assert isinstance(result, Job) |
147 | | - assert result.job_id == "job-123" |
148 | | - assert result.status == JobStatus.in_progress |
149 | | - |
150 | | - def test_job_status(self): |
151 | | - # Mock Evaluator API response |
152 | | - mock_evaluator_response = {"id": "job-123", "status": "completed"} |
153 | | - self.mock_evaluator_get.return_value = mock_evaluator_response |
154 | | - |
155 | | - # Get the Evaluation job |
156 | | - result = self.run_async(self.eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")) |
157 | | - |
158 | | - # Verify the result |
159 | | - assert isinstance(result, Job) |
160 | | - assert result.job_id == "job-123" |
161 | | - assert result.status == JobStatus.completed |
162 | | - |
163 | | - # Verify the API was called correctly |
164 | | - self.mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}") |
165 | | - |
166 | | - def test_job_cancel(self): |
167 | | - # Mock Evaluator API response |
168 | | - mock_evaluator_response = {"id": "job-123", "status": "cancelled"} |
169 | | - self.mock_evaluator_post.return_value = mock_evaluator_response |
170 | | - |
171 | | - # Cancel the Evaluation job |
172 | | - self.run_async(self.eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")) |
173 | | - |
174 | | - # Verify the API was called correctly |
175 | | - self.mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {}) |
176 | | - |
177 | | - def test_job_result(self): |
178 | | - # Mock Evaluator API responses |
179 | | - mock_job_status_response = {"id": "job-123", "status": "completed"} |
180 | | - mock_job_results_response = { |
181 | | - "id": "job-123", |
182 | | - "status": "completed", |
183 | | - "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}}, |
184 | | - } |
185 | | - self.mock_evaluator_get.side_effect = [ |
186 | | - mock_job_status_response, # First call to retrieve job |
187 | | - mock_job_results_response, # Second call to retrieve job results |
188 | | - ] |
189 | | - |
190 | | - # Get the Evaluation job results |
191 | | - result = self.run_async(self.eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123")) |
192 | | - |
193 | | - # Verify the result |
194 | | - assert isinstance(result, EvaluateResponse) |
195 | | - assert MOCK_BENCHMARK_ID in result.scores |
196 | | - assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85 |
197 | | - |
198 | | - # Verify the API was called correctly |
199 | | - assert self.mock_evaluator_get.call_count == 2 |
200 | | - self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123") |
201 | | - self.mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results") |
| 136 | + ) |
| 137 | + |
| 138 | + # Mock Evaluator API response |
| 139 | + mock_evaluator_response = {"id": "job-123", "status": "created"} |
| 140 | + mock_evaluator_post.return_value = mock_evaluator_response |
| 141 | + |
| 142 | + # Run the Evaluation job |
| 143 | + result = await eval_impl.run_eval(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config) |
| 144 | + |
| 145 | + # Verify the Evaluator API was called correctly |
| 146 | + mock_evaluator_post.assert_called_once() |
| 147 | + _assert_request_body( |
| 148 | + mock_evaluator_post, |
| 149 | + { |
| 150 | + "config": f"nvidia/{MOCK_BENCHMARK_ID}", |
| 151 | + "target": {"type": "model", "model": "meta/llama-3.1-8b-instruct"}, |
| 152 | + }, |
| 153 | + ) |
| 154 | + |
| 155 | + # Verify the result |
| 156 | + assert isinstance(result, Job) |
| 157 | + assert result.job_id == "job-123" |
| 158 | + assert result.status == JobStatus.in_progress |
| 159 | + |
| 160 | + |
| 161 | +@pytest.mark.asyncio |
| 162 | +async def test_job_status(nvidia_eval_setup): |
| 163 | + eval_impl = nvidia_eval_setup["eval_impl"] |
| 164 | + mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"] |
| 165 | + |
| 166 | + # Mock Evaluator API response |
| 167 | + mock_evaluator_response = {"id": "job-123", "status": "completed"} |
| 168 | + mock_evaluator_get.return_value = mock_evaluator_response |
| 169 | + |
| 170 | + # Get the Evaluation job |
| 171 | + result = await eval_impl.job_status(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123") |
| 172 | + |
| 173 | + # Verify the result |
| 174 | + assert isinstance(result, Job) |
| 175 | + assert result.job_id == "job-123" |
| 176 | + assert result.status == JobStatus.completed |
| 177 | + |
| 178 | + # Verify the API was called correctly |
| 179 | + mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}") |
| 180 | + |
| 181 | + |
| 182 | +@pytest.mark.asyncio |
| 183 | +async def test_job_cancel(nvidia_eval_setup): |
| 184 | + eval_impl = nvidia_eval_setup["eval_impl"] |
| 185 | + mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"] |
| 186 | + |
| 187 | + # Mock Evaluator API response |
| 188 | + mock_evaluator_response = {"id": "job-123", "status": "cancelled"} |
| 189 | + mock_evaluator_post.return_value = mock_evaluator_response |
| 190 | + |
| 191 | + # Cancel the Evaluation job |
| 192 | + await eval_impl.job_cancel(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123") |
| 193 | + |
| 194 | + # Verify the API was called correctly |
| 195 | + mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {}) |
| 196 | + |
| 197 | + |
| 198 | +@pytest.mark.asyncio |
| 199 | +async def test_job_result(nvidia_eval_setup): |
| 200 | + eval_impl = nvidia_eval_setup["eval_impl"] |
| 201 | + mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"] |
| 202 | + |
| 203 | + # Mock Evaluator API responses |
| 204 | + mock_job_status_response = {"id": "job-123", "status": "completed"} |
| 205 | + mock_job_results_response = { |
| 206 | + "id": "job-123", |
| 207 | + "status": "completed", |
| 208 | + "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}}, |
| 209 | + } |
| 210 | + mock_evaluator_get.side_effect = [ |
| 211 | + mock_job_status_response, # First call to retrieve job |
| 212 | + mock_job_results_response, # Second call to retrieve job results |
| 213 | + ] |
| 214 | + |
| 215 | + # Get the Evaluation job results |
| 216 | + result = await eval_impl.job_result(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123") |
| 217 | + |
| 218 | + # Verify the result |
| 219 | + assert isinstance(result, EvaluateResponse) |
| 220 | + assert MOCK_BENCHMARK_ID in result.scores |
| 221 | + assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85 |
| 222 | + |
| 223 | + # Verify the API was called correctly |
| 224 | + assert mock_evaluator_get.call_count == 2 |
| 225 | + mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123") |
| 226 | + mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results") |
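The diff above follows a common unittest-to-pytest migration pattern: `setUp`/`tearDown` and the `run_async` helper are replaced by a single `yield` fixture that builds the object under test, keeps `patch` context managers active for the duration of each test, and hands the mocks to the test functions as a dict. Below is a minimal, self-contained sketch of that pattern; the `EvalClient` class, its `_get` method, and the test names are hypothetical stand-ins for illustration, not code from this repository.

```python
from unittest.mock import MagicMock, patch

import pytest


class EvalClient:
    """Hypothetical stand-in for the implementation under test."""

    def __init__(self, datasets_api):
        self.datasets_api = datasets_api

    def _get(self, path):
        # In the real implementation this would issue an HTTP request.
        raise NotImplementedError

    def job_status(self, job_id):
        return self._get(f"/jobs/{job_id}")["status"]


@pytest.fixture
def client_setup():
    # setUp equivalent: build the object under test with mocked collaborators.
    datasets_api = MagicMock()
    client = EvalClient(datasets_api=datasets_api)

    # The patch stays active for the whole test because the fixture yields
    # inside the `with` block; leaving the block on teardown replaces tearDown.
    with patch.object(EvalClient, "_get") as mock_get:
        yield {"client": client, "mock_get": mock_get, "datasets_api": datasets_api}


def test_job_status(client_setup):
    client_setup["mock_get"].return_value = {"id": "job-1", "status": "completed"}

    assert client_setup["client"].job_status("job-1") == "completed"
    client_setup["mock_get"].assert_called_once_with("/jobs/job-1")
```

In the actual change the tests are `async def` and marked with `@pytest.mark.asyncio`, so the suite presumably also relies on the pytest-asyncio plugin being installed and configured; that dependency is not visible in this diff.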