Skip to content

Commit ab32173

Browse files
authored
feat: create HTTP DELETE API endpoints to unregister ScoringFn and Benchmark resources in Llama Stack (#3371)
# What does this PR do? This PR provides functionality for users to unregister ScoringFn and Benchmark resources for the `scoring` and `eval` APIs. Closes #3051. ## Test Plan Updated integration and unit tests, verified via CI workflow.
1 parent 01bdcce commit ab32173

File tree

13 files changed

+241
-3
lines changed

13 files changed

+241
-3
lines changed

docs/_static/llama-stack-spec.html

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1380,6 +1380,40 @@
13801380
}
13811381
}
13821382
]
1383+
},
1384+
"delete": {
1385+
"responses": {
1386+
"200": {
1387+
"description": "OK"
1388+
},
1389+
"400": {
1390+
"$ref": "#/components/responses/BadRequest400"
1391+
},
1392+
"429": {
1393+
"$ref": "#/components/responses/TooManyRequests429"
1394+
},
1395+
"500": {
1396+
"$ref": "#/components/responses/InternalServerError500"
1397+
},
1398+
"default": {
1399+
"$ref": "#/components/responses/DefaultError"
1400+
}
1401+
},
1402+
"tags": [
1403+
"Benchmarks"
1404+
],
1405+
"description": "Unregister a benchmark.",
1406+
"parameters": [
1407+
{
1408+
"name": "benchmark_id",
1409+
"in": "path",
1410+
"description": "The ID of the benchmark to unregister.",
1411+
"required": true,
1412+
"schema": {
1413+
"type": "string"
1414+
}
1415+
}
1416+
]
13831417
}
13841418
},
13851419
"/v1/openai/v1/chat/completions/{completion_id}": {
@@ -1620,6 +1654,40 @@
16201654
}
16211655
}
16221656
]
1657+
},
1658+
"delete": {
1659+
"responses": {
1660+
"200": {
1661+
"description": "OK"
1662+
},
1663+
"400": {
1664+
"$ref": "#/components/responses/BadRequest400"
1665+
},
1666+
"429": {
1667+
"$ref": "#/components/responses/TooManyRequests429"
1668+
},
1669+
"500": {
1670+
"$ref": "#/components/responses/InternalServerError500"
1671+
},
1672+
"default": {
1673+
"$ref": "#/components/responses/DefaultError"
1674+
}
1675+
},
1676+
"tags": [
1677+
"ScoringFunctions"
1678+
],
1679+
"description": "Unregister a scoring function.",
1680+
"parameters": [
1681+
{
1682+
"name": "scoring_fn_id",
1683+
"in": "path",
1684+
"description": "The ID of the scoring function to unregister.",
1685+
"required": true,
1686+
"schema": {
1687+
"type": "string"
1688+
}
1689+
}
1690+
]
16231691
}
16241692
},
16251693
"/v1/shields/{identifier}": {

docs/_static/llama-stack-spec.yaml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,30 @@ paths:
954954
required: true
955955
schema:
956956
type: string
957+
delete:
958+
responses:
959+
'200':
960+
description: OK
961+
'400':
962+
$ref: '#/components/responses/BadRequest400'
963+
'429':
964+
$ref: >-
965+
#/components/responses/TooManyRequests429
966+
'500':
967+
$ref: >-
968+
#/components/responses/InternalServerError500
969+
default:
970+
$ref: '#/components/responses/DefaultError'
971+
tags:
972+
- Benchmarks
973+
description: Unregister a benchmark.
974+
parameters:
975+
- name: benchmark_id
976+
in: path
977+
description: The ID of the benchmark to unregister.
978+
required: true
979+
schema:
980+
type: string
957981
/v1/openai/v1/chat/completions/{completion_id}:
958982
get:
959983
responses:
@@ -1119,6 +1143,31 @@ paths:
11191143
required: true
11201144
schema:
11211145
type: string
1146+
delete:
1147+
responses:
1148+
'200':
1149+
description: OK
1150+
'400':
1151+
$ref: '#/components/responses/BadRequest400'
1152+
'429':
1153+
$ref: >-
1154+
#/components/responses/TooManyRequests429
1155+
'500':
1156+
$ref: >-
1157+
#/components/responses/InternalServerError500
1158+
default:
1159+
$ref: '#/components/responses/DefaultError'
1160+
tags:
1161+
- ScoringFunctions
1162+
description: Unregister a scoring function.
1163+
parameters:
1164+
- name: scoring_fn_id
1165+
in: path
1166+
description: >-
1167+
The ID of the scoring function to unregister.
1168+
required: true
1169+
schema:
1170+
type: string
11221171
/v1/shields/{identifier}:
11231172
get:
11241173
responses:

llama_stack/apis/benchmarks/benchmarks.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,11 @@ async def register_benchmark(
9393
:param metadata: The metadata to use for the benchmark.
9494
"""
9595
...
96+
97+
@webmethod(route="/eval/benchmarks/{benchmark_id}", method="DELETE")
98+
async def unregister_benchmark(self, benchmark_id: str) -> None:
99+
"""Unregister a benchmark.
100+
101+
:param benchmark_id: The ID of the benchmark to unregister.
102+
"""
103+
...

llama_stack/apis/scoring_functions/scoring_functions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,3 +197,11 @@ async def register_scoring_function(
197197
:param params: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
198198
"""
199199
...
200+
201+
@webmethod(route="/scoring-functions/{scoring_fn_id:path}", method="DELETE")
202+
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
203+
"""Unregister a scoring function.
204+
205+
:param scoring_fn_id: The ID of the scoring function to unregister.
206+
"""
207+
...

llama_stack/core/routing_tables/benchmarks.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,7 @@ async def register_benchmark(
5656
provider_resource_id=provider_benchmark_id,
5757
)
5858
await self.register_object(benchmark)
59+
60+
async def unregister_benchmark(self, benchmark_id: str) -> None:
61+
existing_benchmark = await self.get_benchmark(benchmark_id)
62+
await self.unregister_object(existing_benchmark)

llama_stack/core/routing_tables/common.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
6464
return await p.unregister_shield(obj.identifier)
6565
elif api == Api.datasetio:
6666
return await p.unregister_dataset(obj.identifier)
67+
elif api == Api.eval:
68+
return await p.unregister_benchmark(obj.identifier)
69+
elif api == Api.scoring:
70+
return await p.unregister_scoring_function(obj.identifier)
6771
elif api == Api.tool_runtime:
6872
return await p.unregister_toolgroup(obj.identifier)
6973
else:

llama_stack/core/routing_tables/scoring_functions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,7 @@ async def register_scoring_function(
6060
)
6161
scoring_fn.provider_id = provider_id
6262
await self.register_object(scoring_fn)
63+
64+
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
65+
existing_scoring_fn = await self.get_scoring_function(scoring_fn_id)
66+
await self.unregister_object(existing_scoring_fn)

llama_stack/providers/inline/eval/meta_reference/eval.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ async def register_benchmark(self, task_def: Benchmark) -> None:
7575
)
7676
self.benchmarks[task_def.identifier] = task_def
7777

78+
async def unregister_benchmark(self, benchmark_id: str) -> None:
79+
if benchmark_id in self.benchmarks:
80+
del self.benchmarks[benchmark_id]
81+
82+
key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
83+
await self.kvstore.delete(key)
84+
7885
async def run_eval(
7986
self,
8087
benchmark_id: str,

llama_stack/providers/inline/scoring/llm_as_judge/scoring.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ async def list_scoring_functions(self) -> list[ScoringFn]:
6363
async def register_scoring_function(self, function_def: ScoringFn) -> None:
6464
self.llm_as_judge_fn.register_scoring_fn_def(function_def)
6565

66+
async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
67+
self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
68+
6669
async def score_batch(
6770
self,
6871
dataset_id: str,

llama_stack/providers/remote/eval/nvidia/eval.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,18 +51,23 @@ async def initialize(self) -> None: ...
5151

5252
async def shutdown(self) -> None: ...
5353

54-
async def _evaluator_get(self, path):
54+
async def _evaluator_get(self, path: str):
5555
"""Helper for making GET requests to the evaluator service."""
5656
response = requests.get(url=f"{self.config.evaluator_url}{path}")
5757
response.raise_for_status()
5858
return response.json()
5959

60-
async def _evaluator_post(self, path, data):
60+
async def _evaluator_post(self, path: str, data: dict[str, Any]):
6161
"""Helper for making POST requests to the evaluator service."""
6262
response = requests.post(url=f"{self.config.evaluator_url}{path}", json=data)
6363
response.raise_for_status()
6464
return response.json()
6565

66+
async def _evaluator_delete(self, path: str) -> None:
67+
"""Helper for making DELETE requests to the evaluator service."""
68+
response = requests.delete(url=f"{self.config.evaluator_url}{path}")
69+
response.raise_for_status()
70+
6671
async def register_benchmark(self, task_def: Benchmark) -> None:
6772
"""Register a benchmark as an evaluation configuration."""
6873
await self._evaluator_post(
@@ -75,6 +80,10 @@ async def register_benchmark(self, task_def: Benchmark) -> None:
7580
},
7681
)
7782

83+
async def unregister_benchmark(self, benchmark_id: str) -> None:
84+
"""Unregister a benchmark evaluation configuration from NeMo Evaluator."""
85+
await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
86+
7887
async def run_eval(
7988
self,
8089
benchmark_id: str,

0 commit comments

Comments
 (0)