feat: LLM-based evaluators return meta info from OpenAI (#7947)
* LLM-Evaluator returns metadata from OpenAI

* adding tests

* adding release notes

* updating test

* updating release notes

* fixing live tests

* addressing PR comments

* fixing tests

* Update releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml

Co-authored-by: Stefano Fiorucci <[email protected]>

* Update llm_evaluator.py

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
davidsbatista and anakin87 authored Jul 2, 2024
1 parent 3068ea2 commit 1865124
Showing 5 changed files with 29 additions and 3 deletions.
10 changes: 8 additions & 2 deletions haystack/components/evaluators/llm_evaluator.py
@@ -171,10 +171,12 @@ def run(self, **inputs) -> Dict[str, Any]:
         :param inputs:
             The input values to evaluate. The keys are the input names and the values are lists of input values.
         :returns:
-            A dictionary with a single `results` entry that contains a list of results.
+            A dictionary with a `results` entry that contains a list of results.
             Each result is a dictionary containing the keys as defined in the `outputs` parameter of the LLMEvaluator
             and the evaluation results as the values. If an exception occurs for a particular input value, the result
             will be `None` for that entry.
+            If the API is "openai" and the response contains a "meta" key, the metadata from OpenAI will be included
+            in the output dictionary, under the key "meta".
         :raises ValueError:
             Only in the case that `raise_on_failure` is set to True and the received inputs are not lists or have
             different lengths, or if the output is not a valid JSON or doesn't contain the expected keys.
@@ -187,6 +189,7 @@ def run(self, **inputs) -> Dict[str, Any]:
         list_of_input_names_to_values = [dict(zip(input_names, v)) for v in values]

         results: List[Optional[Dict[str, Any]]] = []
+        metadata = None
         errors = 0
         for input_names_to_values in tqdm(list_of_input_names_to_values, disable=not self.progress_bar):
             prompt = self.builder.run(**input_names_to_values)
@@ -208,11 +211,14 @@ def run(self, **inputs) -> Dict[str, Any]:
                 results.append(None)
                 errors += 1

+            if self.api == "openai" and "meta" in result:
+                metadata = result["meta"]
+
         if errors > 0:
             msg = f"LLM evaluator failed for {errors} out of {len(list_of_input_names_to_values)} inputs."
             warn(msg)

-        return {"results": results}
+        return {"results": results, "meta": metadata}

     def prepare_template(self) -> str:
         """
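For context beyond the diff, here is a minimal usage sketch of how the new `meta` entry surfaces to callers. It assumes Haystack 2.x with the `OPENAI_API_KEY` environment variable set; the instructions, inputs, and examples are illustrative only and not taken from this commit.

    from typing import List

    from haystack.components.evaluators import LLMEvaluator

    # Hypothetical evaluator: judges whether a predicted answer is a single word.
    evaluator = LLMEvaluator(
        instructions="Is the predicted answer a single word? Respond with a JSON object with a single key 'score': 1 for yes, 0 for no.",
        inputs=[("predicted_answers", List[str])],
        outputs=["score"],
        examples=[
            {"inputs": {"predicted_answers": "Berlin"}, "outputs": {"score": 1}},
            {"inputs": {"predicted_answers": "It is Berlin."}, "outputs": {"score": 0}},
        ],
    )

    result = evaluator.run(predicted_answers=["Berlin", "Paris is the capital of France."])
    print(result["results"])  # one dict per input, e.g. [{"score": 1}, {"score": 0}]
    print(result["meta"])     # OpenAI response metadata when api="openai", otherwise None

Note that `metadata` is overwritten on each loop iteration, so the returned `meta` reflects the last reply that carried a "meta" key; the mocked generators in the tests below return no such key, which is why they expect `"meta": None`.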
5 changes: 5 additions & 0 deletions releasenotes/notes/adding-metadata-info-from-OpenAI-f5309af5f59bb6a7.yaml
@@ -0,0 +1,5 @@
+---
+
+enhancements:
+  - |
+    When using "openai" for the LLM-based evaluators the metadata from OpenAI will be in the output dictionary, under the key "meta".
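Continuing the sketch above, the live tests below assume the forwarded metadata is shaped like `OpenAIGenerator` output: a list with one entry per reply whose `usage` dict holds token counts. Only the three usage counters are asserted; other keys are not guaranteed here.

    meta = result["meta"]  # None unless the OpenAI backend returned a "meta" key
    if meta is not None:
        usage = meta[0]["usage"]  # usage statistics of the first reply
        print(usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])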
7 changes: 7 additions & 0 deletions test/components/evaluators/test_context_relevance_evaluator.py
@@ -160,6 +160,7 @@ def generator_run(self, *args, **kwargs):
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }

     def test_run_no_statements_extracted(self, monkeypatch):
@@ -192,6 +193,7 @@ def generator_run(self, *args, **kwargs):
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }

     def test_run_missing_parameters(self, monkeypatch):
@@ -256,6 +258,11 @@ def test_live_run(self):
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)

+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
8 changes: 8 additions & 0 deletions test/components/evaluators/test_faithfulness_evaluator.py
@@ -179,6 +179,7 @@ def generator_run(self, *args, **kwargs):
                 {"score": 1, "statement_scores": [1, 1], "statements": ["c", "d"]},
             ],
             "score": 0.75,
+            "meta": None,
         }

     def test_run_no_statements_extracted(self, monkeypatch):
@@ -215,6 +216,7 @@ def generator_run(self, *args, **kwargs):
                 {"score": 0, "statement_scores": [], "statements": []},
             ],
             "score": 0.25,
+            "meta": None,
         }

     def test_run_missing_parameters(self, monkeypatch):
@@ -282,3 +284,9 @@ def test_live_run(self):
         assert all(field in result for field in required_fields)
         nested_required_fields = {"score", "statement_scores", "statements"}
         assert all(field in result["results"][0] for field in nested_required_fields)
+
+        # assert that metadata is present in the result
+        assert "meta" in result
+        assert "prompt_tokens" in result["meta"][0]["usage"]
+        assert "completion_tokens" in result["meta"][0]["usage"]
+        assert "total_tokens" in result["meta"][0]["usage"]
2 changes: 1 addition & 1 deletion test/components/evaluators/test_llm_evaluator.py
@@ -339,7 +339,7 @@ def generator_run(self, *args, **kwargs):
         monkeypatch.setattr("haystack.components.generators.openai.OpenAIGenerator.run", generator_run)

         results = component.run(questions=["What is the capital of Germany?"], predicted_answers=["Berlin"])
-        assert results == {"results": [{"score": 0.5}]}
+        assert results == {"results": [{"score": 0.5}], "meta": None}

     def test_prepare_template(self, monkeypatch):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
