Fix Reported Bugs in the Crash Analyzer (#1154)

AmPaschal · web-flow · commit bca17b03aa76 · 2025-08-07T09:24:35.000+10:00
This PR addresses the three errors reported in [Issue 1153](#1153). ## Error 1: GDB complains that it cannot insert a breakpoint because it cannot access memory at the breakpoint's address. To address this error, this PR modifies the oss_fuzz_checkout.prepare_project_image function to use the oss-fuzz project created by the crash analyzer using the Evaluator.create_ossfuzz_project_with_gdb function. In addition, this PR disables caching by default. This is because when a cached project image is used, OSS-Fuzz-Gen edits the Dockerfile in the created oss-fuzz project and removes the commands modifying environment variables. A cleaner solution would be to also modify the compile flags in the cached image, but I couldn't get this to work within the short time I worked on this. ## Error 2: GDB complains that the artifact directory was not found. Using the oss-fuzz project created using the Evaluator.create_ossfuzz_project_with_gdb function partially addresses this error. I also modified the tutorial of the GDB tool so that it references the correct path of the artifact in the project container used by the GDB tool. ## Error 3: The LLM hallucinates GDB interactions and uses this hallucinated interaction to derive its conclusion. Since I cannot prevent the LLM from hallucinating, I addressed this error by adding two validations to the LLM response. First, I added a check that the LLM response does not contain both gdb/bash commands and the LLM's conclusion. The reasoning is that the LLM should not be issuing tool commands and providing a conclusion at the same time. In the future, I expect that the use of function tools should also address this problem. I also added a second validation that ensures the gdb tool is used at least once before the LLM produces a conclusion. I'm not sure if this can prevent hallucination, but from my experience, once the LLM uses the GDB tool the first time, it continues using the GDB tool. 
Finally, this PR also fixes a bug in the Crash Analyzer where the ProjectContainer tool is not terminated before the Crash Analyzer exits and adds necessary files for testing the Crash Analyzer directly. ## Evaluation Instruction The Crash Analyzer (before and after the changes provided by this PR) can be tested directly using the command below (change the oss-fuzz directory path): ``` python3 -m agent_tests.agent_test -y benchmark-sets/comparison/mosh.yaml -f _ZN8Terminal11Framebuffer6resizeEii -p CrashAnalyzer -pf agent_tests/prompt_files/crash-analyzer-mosh-01.txt -afp ./agent_tests/2025-07-16-1148-pamusuo-analyzer-tests-1/ -of [path/to/oss-fuzz] > result-test-01.txt 2>&1 ``` ## Expectation Without the contributed changes, the LLM response will exhibit one of the three errors described in [Issue 1153](#1153). With the contributed changes, the LLM should use the GDB tool at least once, and the GDB tool invocation should not fail.
diff --git a/agent/crash_analyzer.py b/agent/crash_analyzer.py
@@ -130,12 +130,27 @@ def _container_handle_conclusion(self, cur_round: int, response: str,
   def _container_tool_reaction(self, cur_round: int, response: str,
                                crash_result: CrashResult) -> Optional[Prompt]:
     """Validates LLM conclusion or executes its command."""
+    extra_note = ''
+    # If there's a conclusion tag and a tool usage tag, then there's an error
+    prompt = prompt_builder.CrashAnalyzerTemplateBuilder(self.llm,
+                                                         None).build([])
+    if self._parse_tag(response, 'conclusion') and (self._parse_tag(
+        response, 'gdb') or self._parse_tag(response, 'bash')):
+      extra_note = 'NOTE: You cannot provide both tool commands and conclusion in the same response.'
+      return self._container_handle_invalid_tool_usage(
+          [self.gdb_tool, self.bash_tool], cur_round, response, prompt,
+          extra_note)
+
     if self._parse_tag(response, 'conclusion'):
+      if not self.gdb_tool_used:
+        extra_note = 'NOTE: You MUST use the provided GDB tool to analyze the crash before providing a conclusion.'
+        return self._container_handle_invalid_tool_usage(
+            [self.gdb_tool, self.bash_tool], cur_round, response, prompt,
+            extra_note)
       return self._container_handle_conclusion(cur_round, response,
                                                crash_result)
-    prompt = prompt_builder.CrashAnalyzerTemplateBuilder(self.llm,
-                                                         None).build([])
     if self._parse_tag(response, 'gdb'):
+      self.gdb_tool_used = True
       return self._container_handle_gdb_command(response, self.gdb_tool, prompt)
     if self._parse_tag(response, 'bash'):
       return self._container_handle_bash_command(response, self.bash_tool,
@@ -201,6 +216,8 @@ def execute(self, result_history: list[Result]) -> AnalysisResult:
     self.gdb_tool.execute(f'screen -dmS gdb_session -L '
                           f'-Logfile /tmp/gdb_log.txt '
                           f'gdb /out/{last_result.benchmark.target_name}')
+    # Define variable to keep track of gdb tool usage
+    self.gdb_tool_used = False
     self.bash_tool = ProjectContainerTool(
         benchmark, name='check', project_name=generated_oss_fuzz_project)
     self.bash_tool.compile(extra_commands=' && rm -rf /out/* > /dev/null')
@@ -227,10 +244,12 @@ def execute(self, result_history: list[Result]) -> AnalysisResult:
         self._sleep_random_duration(trial=self.trial)
     finally:
       # Cleanup: stop the container
-      logger.debug('Stopping the crash analyze container %s',
+      logger.debug('Stopping the crash analyze containers: %s, %s',
                    self.gdb_tool.container_id,
+                   self.bash_tool.container_id,
                    trial=self.trial)
       self.gdb_tool.terminate()
+      self.bash_tool.terminate()
 
     analysis_result = AnalysisResult(
         author=self,
diff --git a/agent_tests/2025-07-16-1148-pamusuo-analyzer-tests-1/10.build_script b/agent_tests/2025-07-16-1148-pamusuo-analyzer-tests-1/10.build_script
diff --git a/agent_tests/2025-07-16-1148-pamusuo-analyzer-tests-1/10.fuzz_target b/agent_tests/2025-07-16-1148-pamusuo-analyzer-tests-1/10.fuzz_target
@@ -0,0 +1,31 @@
+#include <fuzzer/FuzzedDataProvider.h>
+#include <cstddef>
+#include <cstdint>
+
+// Per instruction, include this specific header.
+#include "/usr/include/c++/9/bits/basic_string.h"
+
+// Headers for the classes under test.
+#include "src/terminal/terminal.h"
+#include "src/terminal/parseraction.h"
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  FuzzedDataProvider provider(data, size);
+
+  // Use reasonable limits to avoid excessive memory allocation which would cause timeouts.
+  const size_t init_width = provider.ConsumeIntegralInRange<size_t>(1, 1024);
+  const size_t init_height = provider.ConsumeIntegralInRange<size_t>(1, 1024);
+
+  Terminal::Emulator emulator(init_width, init_height);
+
+  const size_t resize_width = provider.ConsumeIntegralInRange<size_t>(0, 1024);
+  const size_t resize_height = provider.ConsumeIntegralInRange<size_t>(0, 1024);
+
+  // Create a Resize action object, which is a friend of Emulator.
+  const Parser::Resize resize_action(resize_width, resize_height);
+
+  // Call the public method that will, in turn, call the private resize method.
+  resize_action.act_on_terminal(&emulator);
+
+  return 0;
+}
diff --git a/agent_tests/2025-07-16-1148-pamusuo-analyzer-tests-1/crash-da39a3ee5e6b4b0d3255bfef95601890afd80709 b/agent_tests/2025-07-16-1148-pamusuo-analyzer-tests-1/crash-da39a3ee5e6b4b0d3255bfef95601890afd80709
diff --git a/agent_tests/agent_test.py b/agent_tests/agent_test.py
@@ -192,13 +192,24 @@ def get_result_list_for_agent(
   return agent_test_instance.setup_initial_result_list(benchmark, args.prompt)
 
 
+def json_set_converter(obj):
+  """Converts a set to a list for JSON serialization."""
+  if isinstance(obj, set):
+    return list(obj)
+  raise TypeError(
+      f"Object of type {obj.__class__.__name__} is not JSON serializable")
+
+
 def write_result(args: argparse.Namespace, trial: int,
                  result: List[Result]) -> None:
   """Writes the result to a file in the work directory."""
 
   result_file = os.path.join(args.work_dirs.base, f'{trial}_result.json')
   with open(result_file, 'w') as file:
-    json.dump([r.to_dict() for r in result], file, indent=2)
+    json.dump([r.to_dict() for r in result],
+              file,
+              indent=2,
+              default=json_set_converter)
 
   logger.info('Result written to %s', result_file, trial=trial)
 
diff --git a/agent_tests/prompt_files/crash-analyzer-mosh-01.txt b/agent_tests/prompt_files/crash-analyzer-mosh-01.txt
@@ -0,0 +1,227 @@
+Given the following crash report, fuzz driver code and relevant project function code, analyze the cause of the crash using GDB tool step by step.
+First, make a conclusion, ONLY ANSWER "False" if the crash is caused by bug in fuzz driver OR ONLY ANSWER "True" if the crash is caused by bug in project. Second, offer succinct and to-the-point analyses and suggestions.
+
+Below is crash report:
+<log>
+AddressSanitizer: ABRT on unknown address 0x000000000012 (pc 0x7fbf92cc900b bp 0x7fbf92e3e588 sp 0x7ffce9619330 T0)
+SCARINESS: 10 (signal)
+#0 0x7fbf92cc900b in raise (/lib/x86_64-linux-gnu/libc.so.6+0x4300b) (BuildId: 5792732f783158c66fb4f3756458ca24e46e827d)
+#1 0x7fbf92ca8858 in abort (/lib/x86_64-linux-gnu/libc.so.6+0x22858) (BuildId: 5792732f783158c66fb4f3756458ca24e46e827d)
+#2 0x7fbf92ca8728  (/lib/x86_64-linux-gnu/libc.so.6+0x22728) (BuildId: 5792732f783158c66fb4f3756458ca24e46e827d)
+#3 0x7fbf92cb9fd5 in __assert_fail (/lib/x86_64-linux-gnu/libc.so.6+0x33fd5) (BuildId: 5792732f783158c66fb4f3756458ca24e46e827d)
+#4 0x555a8d236679 in Terminal::Framebuffer::resize(int, int) /src/mosh/src/terminal/terminalframebuffer.cc:398:3
+#5 0x555a8d21fda4 in LLVMFuzzerTestOneInput /src/mosh/src/fuzz/terminal_parser_fuzzer.cc:28:17
+#6 0x555a8d0d4430 in fuzzer::Fuzzer::ExecuteCallback(unsigned char const*, unsigned long) /src/llvm-project/compiler-rt/lib/fuzzer/FuzzerLoop.cpp:614:13
+#7 0x555a8d0d5941 in fuzzer::Fuzzer::ReadAndExecuteSeedCorpora(std::__Fuzzer::vector<fuzzer::SizedFile, std::__Fuzzer::allocator<fuzzer::SizedFile>>&) /src/llvm-project/compiler-rt/lib/fuzzer/FuzzerLoop.cpp:807:3
+#8 0x555a8d0d5ed2 in fuzzer::Fuzzer::Loop(std::__Fuzzer::vector<fuzzer::SizedFile, std::__Fuzzer::allocator<fuzzer::SizedFile>>&) /src/llvm-project/compiler-rt/lib/fuzzer/FuzzerLoop.cpp:867:3
+#9 0x555a8d0c500b in fuzzer::FuzzerDriver(int*, char***, int (*)(unsigned char const*, unsigned long)) /src/llvm-project/compiler-rt/lib/fuzzer/FuzzerDriver.cpp:914:6
+#10 0x555a8d0f03e2 in main /src/llvm-project/compiler-rt/lib/fuzzer/FuzzerMain.cpp:20:10
+#11 0x7fbf92caa082 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x24082) (BuildId: 5792732f783158c66fb4f3756458ca24e46e827d)
+#12 0x555a8d0b788d in _start (out/libfuzzer-address-x86_64/terminal_parser_fuzzer+0x5288d)
+
+DEDUP_TOKEN: raise--abort--
+AddressSanitizer can not provide additional info.
+</log>
+
+Below is driver code:
+<code>
+Line 1 - 28:
+#include <fuzzer/FuzzedDataProvider.h>
+#include <cstddef>
+#include <cstdint>
+
+// Per instruction, include this specific header.
+#include "/usr/include/c++/9/bits/basic_string.h"
+
+// Headers for the classes under test.
+#include "src/terminal/terminal.h"
+#include "src/terminal/parseraction.h"
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+FuzzedDataProvider provider(data, size);
+
+// Use reasonable limits to avoid excessive memory allocation which would cause timeouts.
+const size_t init_width = provider.ConsumeIntegralInRange<size_t>(1, 1024);
+const size_t init_height = provider.ConsumeIntegralInRange<size_t>(1, 1024);
+
+Terminal::Emulator emulator(init_width, init_height);
+
+const size_t resize_width = provider.ConsumeIntegralInRange<size_t>(0, 1024);
+const size_t resize_height = provider.ConsumeIntegralInRange<size_t>(0, 1024);
+
+// Create a Resize action object, which is a friend of Emulator.
+const Parser::Resize resize_action(resize_width, resize_height);
+
+// Call the public method that will, in turn, call the private resize method.
+resize_action.act_on_terminal(&emulator);
+</code>
+
+Below is relevant project function code:
+<code>
+{PROJECT_FUNCTION_CODE}
+</code>
+
+To help analyze the root cause behind the runtime crash, you can leverage GDB tool and BASH tool to obtain information.
+
+Instructions:
+1. ALWAYS use the provided GDB or BASH tools to locate the program lines mentioned in the crash report.
+2. DO NOT TRY TO ANALYZE OR COUNT THE LINES OF CODE IN THE PROGRAM YOURSELF.
+<tool>
+**GDB tool Guide**
+You can leverage GDB by interactively sending me a GDB command, and I will provide you with the output of the command. The path of fuzz driver binary is '/out/terminal_parser_fuzzer'. The testcase that triggers runtime crash is stored at '/experiment/results/output-mosh-_zn8terminal8emulator6resizeemm/artifacts/10.fuzz_target-F0-10/crash-da39a3ee5e6b4b0d3255bfef95601890afd80709'.
+
+<interaction protocols>
+1. I have executed 'gdb /out/terminal_parser_fuzzer'. You are now in GDB session, NOT in shell session. DO NOT run 'gdb /out/terminal_parser_fuzzer' again! DO NOT run shell commands!
+2. Strictly ONE GDB command at a time!
+3. Each message you send should first explain the reason why you want to run the command wrapped by <reason></reason>, then provide the command to run wrapped in <gdb></gdb> in this format:
+<reason>
+Reasons here.
+</reason>
+<gdb>
+One gdb command here.
+</gdb>
+4. Each response I send will repeat the command you sent wrapped in <gdb command></gdb command> for you to double-check, followed by the command standard output wrapped in <gdb output></gdb output> and stderr wrapped in <stderr></stderr> in this format:
+<gdb command>
+The command I executed, copied from the command you sent.
+</gdb command>
+<gdb output>
+The standard output of the command.
+</gdb output>
+<stderr>
+The standard error of the command.
+</stderr>
+5. The final goal is to answer questions about runtime crash, executed fuzz driver and project under test: a) ‘False’ (if the crash is caused by bug in fuzz driver) or ‘True’ (if the crash is caused by bug in project)? b) If the crash is caused by bug in fuzz driver, provide analyses, and are there any suggestions for modifying the fuzz driver? c) If the crash is caused by bug in project, provide analyses, and are there any suggestions for patching the project?
+6. If you have a conclusion on above questions, output the conclusion wrapped by <conclusion></conclusion> followed by the analysis and suggestion wrapped in <analysis and suggestion></analysis and suggestion>:
+<conclusion>
+‘False’ or ‘True’
+</conclusion>
+<analysis and suggestion>
+Analysis and suggestion
+</analysis and suggestion>
+</interaction protocols>
+
+<general rules>
+1. DO NOT wrap code snippets with ```, using the XML-style tags above will suffice.
+2. DO NOT Compile or Run Code!
+3. Strictly ONE GDB command at a time!
+4. DO NOT run 'gdb /out/terminal_parser_fuzzer' again!
+5. DO NOT run shell commands!
+</general rules>
+</tool>
+<tool>
+**Bash tool Guide**
+Use the bash tool to investigate files in the fuzz target's build environment. This will help you understand the project source code, the function under test, its dependencies, and any compilation requirements.
+
+<interaction protocols>
+1. STRICTLY Only One Bash Command per message:
+* **DO NOT** send multiple bash commands in each message.
+2. Execute Bash Command Message Structure:
+* Reason for the Command:
+* Explain the reason for running the command.
+* Wrap this explanation within <reason> and </reason> tags.
+* Bash Command:
+* Provide the bash command to execute.
+* Wrap the command with <bash> and </bash> tags.
+* Format Example:
+<reason>
+I want to locate the source file containing the definition of the function-under-test to examine its implementation.
+</reason>
+<bash>
+grep -rn 'function_name(' /src/project-name/
+</bash>
+3. Receiving Bash Command Output Message Structure:
+* Bash execution outputs will be returned in the following format:
+<bash>
+[The command you executed.]
+</bash>
+<stdout>
+[Standard output of the command.]
+</stdout>
+<stderr>
+[Standard error of the command.]
+</stderr>
+</interaction protocols>
+
+<general rules>
+1. File Access and Modification Restrictions:
+* Allowed Actions:
+* View any files and environment variables in the build environment.
+* Prohibited Actions:
+* Do not modify, rename, or create new files.
+* All modifications will not be preserved when building the fuzz target.
+</general rules>
+
+<tool guidelines>
+1. Purposeful Commands:
+* Each bash command should have a clear purpose related to your investigation toward the final goals.
+2. Careful Interpretation:
+* Analyze the output of each command thoroughly to inform your next steps.
+* Keep notes of important findings that will help in modifying the fuzz target and build script.
+3. Clarity and Compliance:
+* Adhere strictly to the interaction protocols and formatting requirements.
+* Ensure your messages are clear and properly formatted.
+4. No Unauthorized Actions:
+* Do not modify files.
+5. Avoid using `pkg-config`:
+* Use bash commands to manually identify the correct file paths
+* Explore the project's directory hierarchy (`/src/<project-name>`) to learn header file locations, the library's naming conventions, and build system.
+</tool guidelines>
+
+<example usages>
+Command 1. Start by locating the function's definition and understand its parameters, e.g.:
+<reason>
+To find the definition of `my_function` in the project directory and understand its implementation details.
+</reason>
+<bash>
+grep -rn 'my_function(' /src/project/
+</bash>
+Command 2. Identify Required Headers:
+<reason>
+To identify the header files in the project directory that declare `my_function`.
+</reason>
+<bash>
+grep -rn 'my_function' /src/project/ --include=*.h
+</bash>
+Command 3. Locate Custom Type Definitions:
+<reason>
+To find the definition of the custom type `CustomType` used by `my_function`.
+</reason>
+<bash>
+grep -rn 'typedef.*CustomType' /src/project/
+</bash>
+Command 4. Examine Existing Fuzz Targets:
+<reason>
+To see how existing fuzz targets include headers and initialize variables in the `LLVMFuzzerTestOneInput` function.
+</reason>
+<bash>
+cat /src/mosh/src/fuzz/terminal_parser_fuzzer.cc
+</bash>
+* Remember you can use the same command on other example fuzz targets under the same parent directory as `/src/mosh/src/fuzz/terminal_parser_fuzzer.cc`.
+Command 5. Check Build Script for Compilation Flags and Libraries:
+<reason>
+To check which compiler flags and libraries are used in the build script.
+</reason>
+<bash>
+cat /src/build.bk.sh
+</bash>
+Command 6. Verify Available Libraries:
+<reason>
+To list the built libraries to verify that the necessary libraries are available.
+</reason>
+<bash>
+ls /src/project/build/libs/
+</bash>
+Command 7. Understand Environment Variables:
+<reason>
+To check if any environment variables related to the project are set.
+</reason>
+<bash>
+printenv | grep 'PROJECT_VARIABLE'
+</bash>
+</example usages>
+
+<final reminder>
+1. Do Not Compile or Run Code:
+* Your investigation is limited to reading and interpreting information using bash commands.
+</final reminder>
+</tool>
diff --git a/experiment/oss_fuzz_checkout.py b/experiment/oss_fuzz_checkout.py
@@ -32,7 +32,7 @@
 
 BUILD_DIR: str = 'build'
 GLOBAL_TEMP_DIR: str = ''
-ENABLE_CACHING = bool(int(os.getenv('OFG_USE_CACHING', '1')))
+ENABLE_CACHING = bool(int(os.getenv('OFG_USE_CACHING', '0')))
 # Assume OSS-Fuzz is at repo root dir by default.
 # This will change if temp_dir is used.
 OSS_FUZZ_DIR: str = os.path.join(
@@ -436,12 +436,13 @@ def create_ossfuzz_project(benchmark: benchmarklib.Benchmark,
   return generated_project_path
 
 
-def prepare_project_image(benchmark: benchmarklib.Benchmark) -> str:
+def prepare_project_image(benchmark: benchmarklib.Benchmark,
+                          project_name: str = '') -> str:
   """Prepares original image of the |project|'s fuzz target build container."""
   project = benchmark.project
-  image_name = f'gcr.io/oss-fuzz/{project}'
-  generated_oss_fuzz_project = f'{benchmark.id}-{uuid.uuid4().hex}'
+  generated_oss_fuzz_project = project_name or f'{benchmark.id}-{uuid.uuid4().hex}'
   generated_oss_fuzz_project = rectify_docker_tag(generated_oss_fuzz_project)
+  image_name = f'gcr.io/oss-fuzz/{generated_oss_fuzz_project}'
   create_ossfuzz_project(benchmark, generated_oss_fuzz_project)
 
   if not ENABLE_CACHING:
diff --git a/prompts/agent/crash_analyzer-priming.txt b/prompts/agent/crash_analyzer-priming.txt
@@ -17,3 +17,7 @@ Below is relevant project function code:
 </code>
 
 To help analyze the root cause behind the runtime crash, you can leverage GDB tool and BASH tool to obtain information.
+
+Instructions
+1. You MUST use the GDB tool to analyze the crash before making a conclusion.
+2. DO NOT hallucinate the output of the provided tools. You must use the tools and use only results provided by the tools.
diff --git a/tool/container_tool.py b/tool/container_tool.py
@@ -45,7 +45,8 @@ def tutorial(self) -> str:
   def _prepare_project_image(self, project_name: str) -> str:
     """Prepares the project's OSS-Fuzz docker image and returns the image name.
     """
-    image_name = oss_fuzz_checkout.prepare_project_image(self.benchmark)
+    image_name = oss_fuzz_checkout.prepare_project_image(
+        self.benchmark, project_name)
     if image_name:
       return image_name
     raise Exception(f'Failed to build image for {project_name}')
diff --git a/tool/gdb_tool.py b/tool/gdb_tool.py