
Commit 8881de0

A Framework for Running Individual OSS-Fuzz-Gen Agents (#1143)
This PR contributes the following:
- An extendable framework (agent_test.py and a BaseAgentTest abstract class) for running individual agents or sequences of agents.
- Implementations of BaseAgentTest for the Function Analyzer, Context Analyzer, and Crash Analyzer agents, which set up the initial result list for each of these agents.
- A README file describing how this framework can be used or extended.
1 parent c584312 commit 8881de0

13 files changed (+1534, -163 lines)
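Before the per-file diffs, a quick orientation: the new agent_tests/agent_test.py (reproduced in full below) resolves the comma-separated `--pipeline` argument into (agent class, agent-test class) pairs through a module-level `agents` registry. The sketch below is illustrative rather than part of the commit; it assumes the repository's Python dependencies are installed and that it is run from the repository root.

```python
# Sketch only: resolving a --pipeline string into (agent class, test class) pairs.
from agent_tests import agent_test

pipeline = agent_test.get_test_pipeline('FunctionAnalyzer,Prototyper,ExecutionStage')
for agent_cls, test_cls in pipeline:
  print(f'{agent_cls.__name__} -> {test_cls.__name__}')
# Per the registry in this commit, this prints:
#   FunctionAnalyzer -> FunctionAnalyzerAgentTest
#   Prototyper -> BaseAgentTest
#   ExecutionStage -> BaseAgentTest
```

Only the first entry's test class is used to seed the run: agent_test.py calls get_result_list_for_agent on pipeline[0] and then hands the growing result list to each subsequent agent in turn.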

README.md

Lines changed: 6 additions & 2 deletions
@@ -37,12 +37,16 @@ The experiment included [1300+ benchmarks](./benchmark-sets/all) from 297 open-s
 Overall, this framework manages to successfully leverage LLMs to generate valid fuzz targets (which generate non-zero coverage increase)
 for 160 C/C++ projects. The maximum line coverage increase is 29% from the existing human-written targets.
 
-Note that these reports are not public as they may contain undisclosed vulnerabilities.
+Note that these reports are not public as they may contain undisclosed vulnerabilities.
 
 ## Usage
 
 Check our detailed [usage guide](./USAGE.md) for instructions on how to run this framework and generate reports based on the results.
 
+## Independent Agent Execution and Evaluation
+You can also execute or evaluate individual agents without running full experiments, using the integrated agent execution framework.
+See the [framework's documentation](./agent_tests/readme.md) for detailed instructions on how to run individual agents or sequence of agents.
+
 ## Collaborations
 Interested in research or open-source community collaborations?
 Please feel free to create an issue or email us: [email protected].
@@ -114,7 +118,7 @@ These bugs could only have been discovered with newly generated targets. They we
 | libunwind | 30.58% | 83.25% | 2899 | 1342 | 1612 | 4388 |
 | openh264 | 30.07% | 50.14% | 6607 | 5751 | 11470 | 19123 |
 
-\* "Total project lines" measures the source code of the project-under-test compiled and linked by the preexisting human-written fuzz targets from OSS-Fuzz.
+\* "Total project lines" measures the source code of the project-under-test compiled and linked by the preexisting human-written fuzz targets from OSS-Fuzz.
 
 \* "Total coverage gain" is calculated using a denominator of the "Total project lines". "Total relative gain" is the increase in coverage compared to the old number of covered lines.

agent/function_analyzer.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ def __init__(self,
                trial: int,
                llm: models.LLM,
                args: argparse.Namespace,
-               benchmark: benchmarklib.Benchmark,\
+               benchmark: benchmarklib.Benchmark,
                name: str = ''):
 
     builder = prompt_builder.FunctionAnalyzerTemplateBuilder(llm, benchmark)

agent_tests/agent_test.py

Lines changed: 269 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A test for the function analyzer agent."""

import argparse
import json
import os
import traceback
from datetime import datetime
from typing import List, Tuple, Type

import logger
import run_one_experiment
from agent import (base_agent, context_analyzer, crash_analyzer,
                   function_analyzer, prototyper)
from agent_tests import (base_agent_test, context_analyzer_test,
                         crash_analyzer_test, function_analyzer_test)
from data_prep import introspector
from experiment import benchmark as benchmarklib
from experiment import workdir
from llm_toolkit import models
from results import AnalysisResult, BuildResult, CrashResult, Result, RunResult
from stage import base_stage, execution_stage

RESULTS_DIR = f'./results-{datetime.now().strftime("%Y-%m-%d-%H-%M")}'

NUM_ANA = int(os.getenv('LLM_NUM_ANA', '2'))
RUN_TIMEOUT: int = 300

agents: dict[str,
             Tuple[Type[base_agent.BaseAgent | base_stage.BaseStage],
                   Type[base_agent_test.BaseAgentTest]]] = {
                       'ContextAnalyzer':
                           (context_analyzer.ContextAnalyzer,
                            context_analyzer_test.ContextAnalyzerAgentTest),
                       'CrashAnalyzer':
                           (crash_analyzer.CrashAnalyzer,
                            crash_analyzer_test.CrashAnalyzerAgentTest),
                       'FunctionAnalyzer':
                           (function_analyzer.FunctionAnalyzer,
                            function_analyzer_test.FunctionAnalyzerAgentTest),
                       'Prototyper': (prototyper.Prototyper,
                                      base_agent_test.BaseAgentTest),
                       'ExecutionStage': (execution_stage.ExecutionStage,
                                          base_agent_test.BaseAgentTest)
                   }


def parse_args() -> argparse.Namespace:
  """Parses command line arguments."""
  parser = argparse.ArgumentParser(
      description='Evaluate the function analyzer agent.')

  parser.add_argument('-y',
                      '--benchmark-yaml',
                      type=str,
                      required=True,
                      help='A benchmark YAML file.')

  parser.add_argument('-f',
                      '--function-name',
                      type=str,
                      required=True,
                      help='The function name to analyze.')

  parser.add_argument('-p',
                      '--pipeline',
                      type=str,
                      required=True,
                      help='Comma-separated list of agent names for testing.')

  parser.add_argument(
      '-pf',
      '--prompt-file',
      type=str,
      default='',
      help='A file containing the prompt to reconstruct for initial agent.')

  parser.add_argument('-npf',
                      '--no-prompt-file',
                      action='store_true',
                      help='Skip using prompt file even if provided.')

  parser.add_argument(
      '-afp',
      '--additional-files-path',
      type=str,
      default='',
      help='The path to a directory containing any additional files needed '
      'by the agents under test.')

  parser.add_argument('-mr',
                      '--max-round',
                      type=int,
                      default=100,
                      help='Max trial round for agents.')

  parser.add_argument('-e',
                      '--introspector-endpoint',
                      type=str,
                      default=introspector.DEFAULT_INTROSPECTOR_ENDPOINT)

  parser.add_argument('-c',
                      '--cloud-experiment-name',
                      type=str,
                      default='',
                      help='The name of the cloud experiment.')
  parser.add_argument('-cb',
                      '--cloud-experiment-bucket',
                      type=str,
                      default='',
                      help='A gcloud bucket to store experiment files.')
  parser.add_argument('--context',
                      action='store_true',
                      default=False,
                      help='Add context to function under test.')
  parser.add_argument('-to', '--run-timeout', type=int, default=RUN_TIMEOUT)

  parser.add_argument(
      '-of',
      '--oss-fuzz-dir',
      help='OSS-Fuzz dir path to use. Create temporary directory by default.',
      default='')

  parser.add_argument('-w', '--work-dir', default=RESULTS_DIR)

  parsed_args = parser.parse_args()

  if not parsed_args.benchmark_yaml.endswith('.yaml') or not os.path.isfile(
      parsed_args.benchmark_yaml):
    raise ValueError('Benchmark YAML file must be a valid .yaml file.')

  if not parsed_args.no_prompt_file:
    if not os.path.isfile(parsed_args.prompt_file):
      raise ValueError('Prompt file must be a valid file.')
    with open(parsed_args.prompt_file, 'r') as file:
      prompt_content = file.read()
    if not prompt_content.strip():
      raise ValueError('Prompt file cannot be empty.')
    parsed_args.prompt = prompt_content.strip()
  else:
    parsed_args.prompt = ''

  return parsed_args


def get_test_pipeline(
    agents_text: str
) -> List[Tuple[Type[base_agent.BaseAgent | base_stage.BaseStage],
                Type[base_agent_test.BaseAgentTest]]]:
  """Returns a pipeline of agents for testing."""

  agent_list = agents_text.strip().split(',')
  pipeline = []
  for agent_name in agent_list:
    if agent_name not in agents:
      raise ValueError(
          f'Agent {agent_name} is not defined in the agents dictionary.')
    pipeline.append(agents[agent_name])
  if not pipeline:
    raise ValueError(
        'No agents found in the pipeline. Please provide a valid agent list.')
  return pipeline


def get_result_list_for_agent(
    args: argparse.Namespace,
    agent_class: Tuple[Type[base_agent.BaseAgent | base_stage.BaseStage],
                       Type[base_agent_test.BaseAgentTest]],
    benchmark: benchmarklib.Benchmark) -> List[Result]:
  """Returns the initial result list for the agent."""

  agent_test_class = agent_class[1]
  # Ensure agent_test_class is a subclass of BaseAgentTest
  if not issubclass(agent_test_class, base_agent_test.BaseAgentTest):
    raise TypeError(
        f"{agent_test_class.__name__} is not a subclass of BaseAgentTest")

  agent_test_instance = agent_test_class(args, trial=1)
  return agent_test_instance.setup_initial_result_list(benchmark, args.prompt)


def write_result(args: argparse.Namespace, trial: int,
                 result: List[Result]) -> None:
  """Writes the result to a file in the work directory."""

  result_file = os.path.join(args.work_dirs.base, f'{trial}_result.json')
  with open(result_file, 'w') as file:
    json.dump([r.to_dict() for r in result], file, indent=2)

  logger.info('Result written to %s', result_file, trial=trial)


if __name__ == '__main__':

  model = models.LLM.setup(ai_binary='', name='vertex_ai_gemini-2-5-pro-chat')

  args = parse_args()

  introspector.set_introspector_endpoints(args.introspector_endpoint)

  run_one_experiment.prepare(args.oss_fuzz_dir)

  # Initialize test benchmark
  benchmarks = benchmarklib.Benchmark.from_yaml(args.benchmark_yaml)

  test_benchmark = [
      benchmark for benchmark in benchmarks
      if benchmark.function_name == args.function_name
  ]

  if not test_benchmark:
    raise ValueError(f'No benchmark found for function {args.function_name}.')

  benchmark = test_benchmark[0]

  # Initialize the working directory
  args.work_dirs = workdir.WorkDirs(
      os.path.join(args.work_dir, f'output-{benchmark.id}'))

  pipeline = get_test_pipeline(args.pipeline)

  args.trial = 1

  result_list = get_result_list_for_agent(args, pipeline[0], benchmark)

  result = None

  try:

    for agent_class in pipeline:
      if issubclass(agent_class[0], base_agent.BaseAgent):
        agent_instance = agent_class[0](args.trial, model, args, benchmark)
        result = agent_instance.execute(result_list)
      elif issubclass(agent_class[0], base_stage.BaseStage):
        agent_instance = agent_class[0](args, args.trial)
        result = agent_instance.execute(result_list, 0)
      else:
        raise TypeError(
            f"Unexpected agent class type: {agent_class[0].__name__}")

      # Prepare for the next agent in the pipeline
      result_list.append(result)

      if isinstance(result, BuildResult):
        logger.get_trial_logger(trial=args.trial).write_fuzz_target(result)
        logger.get_trial_logger(trial=args.trial).write_build_script(result)

    if result_list:
      # Write the final result to a file
      write_result(args, args.trial, result_list)

  except Exception as e:
    logger.error('An error occurred during the agent execution: %s',
                 str(e),
                 trial=args.trial)
    logger.error('Traceback: %s', traceback.format_exc(), trial=args.trial)
agent_tests/base_agent_test.py

Lines changed: 54 additions & 0 deletions
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base class for agent tests."""
import re

import logger


class BaseAgentTest:
  """Base class for agent tests, providing common setup and utility methods."""

  def __init__(self, args, trial):
    self.args = args
    self.trial = trial

  def _parse_tag(self, response: str, tag: str) -> str:
    """Parses the XML-style tags from LLM response."""
    match = re.search(rf'<{tag}>(.*?)</{tag}>', response, re.DOTALL)
    return match.group(1).strip() if match else ''

  def write_requirements_to_file(self, args, requirements: str) -> str:
    """Write the requirements to a file."""
    if not requirements:
      logger.warning('No requirements to write to file.', trial=self.trial)
      return ''

    requirement_path = args.work_dirs.requirements_file_path(self.trial)

    with open(requirement_path, 'w') as f:
      f.write(requirements)

    logger.info('Requirements written to %s',
                requirement_path,
                trial=self.trial)

    return requirement_path

  def setup_initial_result_list(self, benchmark, prompt):
    """Sets up the initial result list for the agent test."""
    # Load the benchmark and prompt file
    raise NotImplementedError(
        "This method should be implemented in subclasses to set up the initial result list."
    )
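setup_initial_result_list is the only method a concrete test must implement; the FunctionAnalyzer, ContextAnalyzer, and CrashAnalyzer test classes added elsewhere in this PR each build the prior Result objects their agent expects when it runs first in a pipeline. The sketch below shows the general shape of such a subclass. It is not part of the commit: the MyAgentTest name, the 'requirements' tag, and the Result keyword arguments are assumptions, so check results.py for the actual constructor before reusing it.

```python
# Hypothetical subclass; the names and the Result(...) arguments are assumptions.
from agent_tests import base_agent_test
from results import Result


class MyAgentTest(base_agent_test.BaseAgentTest):
  """Sketch of a test setup for an agent that runs first in a pipeline."""

  def setup_initial_result_list(self, benchmark, prompt):
    # Optionally persist part of the prompt via the helpers on BaseAgentTest,
    # e.g. an XML-style <requirements> block (the tag name is illustrative).
    requirements = self._parse_tag(prompt, 'requirements')
    if requirements:
      self.write_requirements_to_file(self.args, requirements)

    # Assumed constructor: a bare Result tied to this benchmark and trial.
    return [
        Result(benchmark=benchmark,
               trial=self.trial,
               work_dirs=self.args.work_dirs)
    ]
```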
