ROB-9: Stabilise evals, update evals workflow #299

Open

nherment wants to merge 32 commits into master from continuous_testing_for_evals
Commits (32)
65045f2
test: stabilise evals, update evals workflow
nherment Feb 25, 2025
d0a13e1
chore: ruff
nherment Feb 25, 2025
71430cc
test: wip workflow
nherment Feb 25, 2025
b8a0cbf
test: wip workflow
nherment Feb 25, 2025
074a985
test: wip workflow
nherment Feb 25, 2025
6d8a599
test: wip workflow
nherment Feb 25, 2025
83b5c9f
chore: ruff
nherment Feb 25, 2025
057c0bb
test: wip workflow
nherment Feb 25, 2025
4ac011e
test: wip workflow
nherment Feb 25, 2025
77ee4b1
test: wip workflow
nherment Feb 25, 2025
c4cf5f9
test: wip workflow
nherment Feb 25, 2025
c072283
test: wip workflow
nherment Feb 26, 2025
430c570
chore: ruff
nherment Feb 26, 2025
aeeb541
test: evals workflow WIP
nherment Feb 26, 2025
a4ca29a
test: evals workflow WIP
nherment Feb 26, 2025
c45e773
test: evals workflow WIP
nherment Feb 26, 2025
e5afb09
test: evals workflow WIP
nherment Feb 26, 2025
cb45d07
test: evals workflow WIP
nherment Feb 26, 2025
58a5833
test: remove generate_mocks from evals
nherment Feb 26, 2025
47c0948
test: evals workflow running 4 threads in parallel
nherment Feb 26, 2025
282270a
test: evals workflow WIP
nherment Feb 26, 2025
16e8e56
test: evals workflow WIP
nherment Feb 26, 2025
55805fa
test: replace comment instead of updatig it
nherment Feb 26, 2025
c31ee36
test: evals workflow WIP
nherment Feb 26, 2025
25bff7f
test: evals workflow WIP
nherment Feb 26, 2025
ad6c1c9
Merge branch 'master' into continuous_testing_for_evals
nherment Feb 26, 2025
0dba84b
test: evals workflow WIP
nherment Feb 26, 2025
8ecfb4e
chore: ruff
nherment Feb 26, 2025
1f7aa9d
test: evals workflow WIP
nherment Feb 26, 2025
f6a7fbf
test: evals workflow WIP
nherment Feb 26, 2025
aa83a07
test: investigate 06_job_failure is not stable
nherment Feb 26, 2025
86672c5
test: investigate 07_job_syntax_error is not stable
nherment Feb 26, 2025
67 changes: 63 additions & 4 deletions .github/workflows/llm-evaluation.yaml
@@ -1,6 +1,14 @@
name: Evaluate LLM test cases

on: [push]
on:
pull_request:
branches: ["*"]
push:
branches: [master]

permissions:
pull-requests: write
contents: read

jobs:
build:
@@ -11,10 +19,10 @@ jobs:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

@@ -28,8 +36,59 @@ jobs:
poetry install --no-root

- name: Run tests
id: evals
continue-on-error: true
shell: bash
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
UPLOAD_DATASET: "true"
PUSH_EVALS_TO_BRAINTRUST: "true"
EXPERIMENT_ID: github-${{ github.run_id }}.${{ github.run_number }}.${{ github.run_attempt }}
run: |
poetry run pytest -m "llm"
poetry run pytest tests/llm/test_ask_holmes.py tests/llm/test_investigate.py -n 6
- uses: actions/github-script@v7
with:
retries: 3
if: always()
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const fs = require('fs');
try {
if(!context.issue || !context.issue.number) {
// Only comment on PR if the workflow is run as part of a PR
return
}
const reportContent = fs.readFileSync('evals_report.txt', 'utf8');

const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number
});

const botComment = comments.data.find(comment =>
comment.user.type === 'Bot' &&
comment.body.includes('## Results of HolmesGPT evals')
);

await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: reportContent
});

if (botComment) {
await github.rest.issues.deleteComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: botComment.id
});
}
} catch(e) {
console.log(e)
}
- name: Check test results
if: always() && steps.evals.outcome != 'success'
run: exit 1
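
The job above relies on a common gating pattern: the eval step sets `continue-on-error: true` so the report comment is always posted, and a final step fails the job when the eval step's `outcome` is not `success`. A minimal standalone sketch of that pattern (job name, step ids, and scripts are placeholders, not files from this repository):

```yaml
jobs:
  example:
    runs-on: ubuntu-latest
    steps:
      - name: Run tests
        id: tests
        continue-on-error: true    # keep the job alive so the reporting step still runs
        run: ./run_tests.sh        # hypothetical test command
      - name: Publish report
        if: always()               # runs whether the tests passed or not
        run: ./publish_report.sh   # hypothetical reporting command
      - name: Fail the job if tests failed
        if: always() && steps.tests.outcome != 'success'
        run: exit 1
```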
1 change: 1 addition & 0 deletions .gitignore
@@ -163,3 +163,4 @@ playwright.png
pyrightconfig.json

*.AUTOGENERATED
evals_report.txt
2 changes: 1 addition & 1 deletion holmes/core/tool_calling_llm.py
@@ -124,7 +124,7 @@ def call(
logging.debug(f"running iteration {i}")
# on the last step we don't allow tools - we want to force a reply, not a request to run another tool
tools = NOT_GIVEN if i == max_steps - 1 else tools
tool_choice = NOT_GIVEN if tools == NOT_GIVEN else "auto"
tool_choice = None if tools == NOT_GIVEN else "auto"

total_tokens = self.llm.count_tokens_for_message(messages)
max_context_size = self.llm.get_context_window_size()
10 changes: 5 additions & 5 deletions holmes/plugins/prompts/_general_instructions.jinja2
@@ -12,7 +12,7 @@ In general:
* if you don't know, say that the analysis was inconclusive.
* if there are multiple possible causes list them in a numbered list.
* there will often be errors in the data that are not relevant or that do not have an impact - ignore them in your conclusion if you were not able to tie them to an actual error.
* Always check a pod's logs when checking if it is healthy. Don't assume that because the pod is running and reports healthy that it is running without issues.
* ALWAYS check a pod's logs when checking if it is healthy. A pod in "running" state and reporting healthy does not mean it is running without issues.

If investigating Kubernetes problems:
* run as many kubectl commands as you need to gather more information, then respond.
@@ -65,10 +65,10 @@ Reminder:
* Strive for thoroughness and precision, ensuring the issue is fully addressed.

Special cases and how to reply:
* if you lack tools to access the right data or don't have access to a system, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
* as a special case of that, If a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.
* that is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. in that case, you explain say that permission errors are the cause of the problem and give details
* Tell the user that you are missing an integration to access XYZ if the user asks you about a system or integration you don't know. You should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* Make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
* As a special case of that, if a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.
* That is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. In that case, explain that permission errors are the cause of the problem and give details
* Issues are a subset of findings. When asked about an issue or a finding and you have an id, use the tool `fetch_finding_by_id`.
* For any question, try to make the answer specific to the user's cluster.
** For example, if asked to port forward, find out the app or pod port (kubectl describe) and provide a port forward command specific to the user's question
6 changes: 5 additions & 1 deletion holmes/plugins/prompts/investigation_output_format.jinja2
@@ -1,13 +1,15 @@

{% if structured_output %}
Give your answer in a pure JSON format with the following sections. You can set a null value to a section if it's not relevant to the investigation.
Give your answer in a pure JSON format with the following sections.
The content of each section should be formatted with markdown:

{
{% for title, description in sections.items() %}
"{{ title }}": "{{ description | replace("\n", "\\n") }}",{% endfor %}
}

You MUST set a section as `null` if it's not relevant to the investigation or it does not contain relevant information.

<DO NOT list tools used and DO NOT add a `Tools` section>
{% else %}
Give your answer returning a markdown document structured with the following sections. Use top level markdown titles format `#`.
@@ -19,4 +21,6 @@ Ignore any section that is not relevant to the investigation.
{% endfor %}

# <DO NOT list tools used and DO NOT add a `# Tools` section>

You MUST ignore a section and skip it if it's not relevant to the investigation or it does not contain relevant information.
{% endif %}
34 changes: 25 additions & 9 deletions tests/llm/README.md
@@ -39,16 +39,32 @@ The above file requires a manifest.yaml to deploy resources to your kubernetes cluster

Here are the possible fields in the `test_case.yaml` yaml file:

| Field | Type | Required/optional | Example value | Description |
|-------------------|------------------|-------------------|-----------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| user_prompt | str | Required | Is pod xyz healthy? | The user prompt |
| expected_output | str or List[str] | Required | Yes, pod xyz is healthy. It is running and there are no errors in the logs. | The expected answer from the LLM. This can be a string or a list of expected elements. If it is a string, the response will be scored with 'faithfulness'. Otherwise it is 'correctness'. |
| retrieval_context | List[str] | Optional | - pod xyz is running and healthy - there are no errors in the logs | Context that the LLM is expected to have used in its answer. If present, this generates a 'context' score proportional to the number of matching context elements found in the LLM's output. |
| evaluation | Dict[str, float] | Optional | evaluation: <br/> faithfulness: 1 <br/> context: 1 <br/> | The minimum expected scores. The test will fail unless these are met. Set to 0 for unstable tests. |
| before-test | str | Optional | kubectl apply -f manifest.yaml | A command to run before the LLM evaluation. The CWD for this command is the same folder as the fixture. This step is skipped unless `RUN_LIVE` environment variable is set |
| after-test | str | Optional | kubectl delete -f manifest.yaml | A command to run after the LLM evaluation.The CWD for this command is the same folder as the fixture. Typically cleans up any before-test action. This step is skipped unless `RUN_LIVE` environment variable is set |
| generate_mocks | bool | Optional | True | Whether the test suite should generate mock files. Existing mock files are overwritten. |
| Field | Type | Required/optional | Example value | Description |
|-------------------|-----------------------------------|-------------------|-----------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| user_prompt | str | Required | Is pod xyz healthy? | The user prompt |
| expected_output | str or List[str] | Required | Yes, pod xyz is healthy. It is running and there are no errors in the logs. | The expected answer from the LLM. This can be a string or a list of expected elements. If it is a string, the response will be scored with 'faithfulness'. Otherwise it is 'correctness'. |
| retrieval_context | List[str] | Optional | - pod xyz is running and healthy - there are no errors in the logs | Context that the LLM is expected to have used in its answer. If present, this generates a 'context' score proportional to the number of matching context elements found in the LLM's output. |
| evaluation | Dict[str, float] | Optional | evaluation: <br/> faithfulness: 1 <br/> context: 1 <br/> | The minimum expected scores. The test will fail unless these are met. Set to 0 for unstable tests. |
| before-test | str | Optional | kubectl apply -f manifest.yaml | A command to run before the LLM evaluation. The CWD for this command is the same folder as the fixture. This step is skipped unless `RUN_LIVE` environment variable is set |
| after-test        | str                               | Optional          | kubectl delete -f manifest.yaml                                               | A command to run after the LLM evaluation. The CWD for this command is the same folder as the fixture. Typically cleans up any before-test action. This step is skipped unless `RUN_LIVE` environment variable is set  |
| generate_mocks | bool | Optional | True | Whether the test suite should generate mock files. Existing mock files are overwritten. |
| expected_sections | Dict[str, Union[bool, List[str]]] | Optional | See below... | A check on the sections returned by the LLM for investigations. This can ensure expected data is present but also that unexpected sections are not present. |


Here is an example of `expected_sections` that expects the `Related logs` section to be
present and contain `CrashLoopBackOff`, but also expects the `External links` section
to be missing from the output or set to null/None:

```yaml
expected_output:
- Pod `oomkill-deployment-696dbdbf67-d47z6` is experiencing a `CrashLoopBackOff`
expected_sections:
Related logs:
- CrashLoopBackOff
External links: False
evaluation:
correctness: 1
```
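
For completeness, a minimal `test_case.yaml` combining the required fields with an optional setup/teardown pair could look like the following sketch (the pod name, manifest file, and wording are illustrative, not an actual fixture in this repository):

```yaml
user_prompt: Is pod example-app-7d9f6 healthy?
expected_output:
  # A list of expected elements, so the response is scored with 'correctness'
  - pod example-app-7d9f6 is healthy and its logs show no errors
before-test: kubectl apply -f manifest.yaml    # only runs when RUN_LIVE is set
after-test: kubectl delete -f manifest.yaml    # cleans up the before-test action
evaluation:
  correctness: 1
```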
#### 3. Run the test
84 changes: 84 additions & 0 deletions tests/llm/conftest.py
@@ -0,0 +1,84 @@
import logging
import os
import pytest
from tests.llm.utils.braintrust import get_experiment_results
from tests.llm.utils.constants import PROJECT


def markdown_table(headers, rows):
markdown = "| " + " | ".join(headers) + " |\n"
markdown += "| " + " | ".join(["---" for _ in headers]) + " |\n"
for row in rows:
markdown += "| " + " | ".join(str(cell) for cell in row) + " |\n"
return markdown


@pytest.mark.llm
def pytest_terminal_summary(terminalreporter, exitstatus, config):
if not os.environ.get("PUSH_EVALS_TO_BRAINTRUST"):
# The code fetches the evals from Braintrust to print out a summary.
# Skip running it if the evals have not been uploaded to Braintrust
return

headers = ["Test suite", "Test case", "Status"]
rows = []

# Do not change the title below without updating the github workflow that references it
markdown = "## Results of HolmesGPT evals\n"

for test_suite in ["ask_holmes", "investigate"]:
try:
result = get_experiment_results(PROJECT, test_suite)
result.records.sort(key=lambda x: x.get("span_attributes", {}).get("name"))
total_test_cases = 0
successful_test_cases = 0
for record in result.records:
scores = record.get("scores", None)
span_id = record.get("id")
span_attributes = record.get("span_attributes")
if scores and span_attributes:
span_name = span_attributes.get("name")
test_case = next(
(tc for tc in result.test_cases if tc.get("id") == span_name),
{},
)
correctness_score = scores.get("correctness", 0)
expected_correctness_score = (
test_case.get("metadata", {})
.get("test_case", {})
.get("evaluation", {})
.get("correctness", 1)
)
total_test_cases += 1
status_text = ":x:"
if correctness_score == 1:
successful_test_cases += 1
status_text = ":white_check_mark:"
elif correctness_score >= expected_correctness_score:
status_text = ":warning:"
rows.append(
[
f"[{test_suite}](https://www.braintrust.dev/app/robustadev/p/HolmesGPT/experiments/{result.experiment_name})",
f"[{span_name}](https://www.braintrust.dev/app/robustadev/p/HolmesGPT/experiments/{result.experiment_name}?r={span_id})",
status_text,
]
)
markdown += f"\n- [{test_suite}](https://www.braintrust.dev/app/robustadev/p/HolmesGPT/experiments/{result.experiment_name}): {successful_test_cases}/{total_test_cases} test cases were successful"

except ValueError:
logging.info(
f"Failed to fetch braintrust experiment {PROJECT}-{test_suite}"
)

if len(rows) > 0:
markdown += "\n\n"
markdown += markdown_table(headers, rows)
markdown += "\n\n**Legend**\n"
markdown += "\n- :white_check_mark: the test was successful"
markdown += (
"\n- :warning: the test failed but is known to be flakky or known to fail"
)
markdown += "\n- :x: the test failed and should be fixed before merging the PR"

with open("evals_report.txt", "w", encoding="utf-8") as file:
file.write(markdown)
@@ -1,7 +1,7 @@
user_prompt: 'what is the command to port-forward to << { "type": "pod", "name": "my_grafana_4j981" } >>'
#user_prompt: "what is the command to port-forward to my grafana service?"
expected_output:
- kubectl port-forward pod/my_grafana_4j981 3000:3000 -n default
- kubectl port-forward pod/my_grafana_4j981 <any port>:3000 -n default
retrieval_context:
- "The grafana service is running but does not have an external IP address"
- "The name of the grafana service is my_grafana_4j981"
@@ -1,8 +1,6 @@
user_prompt: 'Explain << { "type": "issue", "id": "asodfkq1209edyhqawdo2uydqawidh" } >>'
expected_output:
- The pod `my-cluster-kafka-0` in the `kafka` namespace was evicted due to low ephemeral storage on the node
- The threshold was 2126213140, but only 1884540Ki was available
- The `kafka` container was using 3264Ki of ephemeral storage
- low ephemeral storage on the node
retrieval_context:
- "Pod my-cluster-kafka was evicted"
- "The node was low on resource: ephemeral-storage. Threshold quantity: 2126213140, available: 1884540Ki. Container kafka was using 3264Ki, request is 0, has larger consumption of ephemeral-storage."