experimental: add new tooling that shows how to create from scratch (#768)

This uses a new version of Fuzz Introspector to analyse code that is not
in OSS-Fuzz, and also uses the prompt generation logic from the OSS-Fuzz-gen
core. It is a minimum viable use case for generating harnesses by way of
Fuzz Introspector (FI) and OSS-Fuzz-gen (OFG).

---------

Signed-off-by: David Korczynski <[email protected]>
DavidKorczynski authored Jan 27, 2025
1 parent db10eac commit ee94805
Showing 3 changed files with 181 additions and 0 deletions.
61 changes: 61 additions & 0 deletions experimental/from_scratch/README.md
@@ -0,0 +1,61 @@
# Sample tooling for generating harnesses for a codebase without harnesses


To run this you need a local checkout of Fuzz Introspector and a target
codebase you want to analyse.
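
Depending on which backend you pick from the OSS-Fuzz-gen `models` module, you
will also need the matching API credentials in your environment. A minimal
sketch, where the model name and environment variable are illustrative
assumptions (check `models.LLM.all_llm_names()` for the names your checkout
supports):

```sh
# Assumed example values only; substitute the model name and credentials
# required by the backend you actually use.
export OPENAI_API_KEY=<your-key>
export MODEL=gpt-4
```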

Sample run where `${MODEL}` holds your model name:

```sh
# Create virtual environment
python3.11 -m virtualenv .venv
. .venv/bin/activate

# Install Fuzz Introspector in virtual environment
git clone https://github.com/ossf/fuzz-introspector
cd fuzz-introspector/src
python3 -m pip install -e .
cd ../../


# Prepare a target
git clone https://github.com/dvhar/dateparse

# Clone oss-fuzz-gen
git clone https://github.com/google/oss-fuzz-gen
cd oss-fuzz-gen

# Generate a harness
python3 -m experimental.from_scratch.generate \
    -l ${MODEL} \
    -f dateparse \
    -t ../dateparse/

# Show harness
cat responses/01.rawoutput
"""
#include <stdio.h>
#include <string.h>
typedef struct{int year;int month; int day;} date_t;
int dateparse(const char* datestr, date_t* t, int *offset, int stringlen); // prototype
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
date_t t;
int offset = 0;
// ensure NULL termination for the data string
char* datestr = (char*)malloc(size + 1);
if (!datestr)
return 0;
memcpy(datestr, data, size);
datestr[size] = '\0';
dateparse(datestr, &t, &offset, size);
free(datestr);
return 0;
}
"""
```
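
The raw output above needs light cleanup before it compiles (for example, the
surrounding `"""` quotes must be removed and `#include <stdint.h>` /
`#include <stdlib.h>` added). A minimal sketch of building and running the
harness with libFuzzer, assuming clang is available and that the parser
implementation lives in `../dateparse/dateparse.c` (file names here are
assumptions, not verified against the dateparse repository):

```sh
# Save the cleaned-up harness next to the generated response.
cp responses/01.rawoutput fuzz_dateparse.c   # then edit as described above

# Build with libFuzzer and AddressSanitizer against the assumed source file.
clang -g -O1 -fsanitize=fuzzer,address \
    fuzz_dateparse.c ../dateparse/dateparse.c \
    -o fuzz_dateparse

# Run the fuzzer for a short while.
./fuzz_dateparse -max_total_time=60
```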
Empty file.
120 changes: 120 additions & 0 deletions experimental/from_scratch/generate.py
@@ -0,0 +1,120 @@
#!/usr/bin/env python3
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module for generating harnesses in arbitrary projects."""

import argparse
import os
import sys
from typing import Optional

# pyright: reportMissingImports = false
from fuzz_introspector.frontends import oss_fuzz as fi_oss_fuzz

from experiment import benchmark as benchmarklib
from llm_toolkit import models, prompt_builder, prompts

NUM_SAMPLES: int = 1
TEMPERATURE: float = 1
MAX_TOKENS: int = 8192


def parse_args() -> argparse.Namespace:
  """Parses command line arguments."""
  parser = argparse.ArgumentParser(
      description='Generate a fuzzing harness for a single target function.')
  parser.add_argument('-l',
                      '--model',
                      default=models.DefaultModel.name,
                      help=('Models available: '
                            f'{", ".join(models.LLM.all_llm_names())}'))
  parser.add_argument('-r',
                      '--response-dir',
                      default='./responses',
                      help='LLM response directory.')
  parser.add_argument('-f',
                      '--function',
                      help='Name of function to generate a target for.',
                      required=True)
  parser.add_argument('-t',
                      '--target-dir',
                      help='Directory with project source.',
                      required=True)
  return parser.parse_args()


def setup_model(args) -> models.LLM:
  return models.LLM.setup(ai_binary='',
                          name=args.model,
                          max_tokens=MAX_TOKENS,
                          num_samples=NUM_SAMPLES,
                          temperature=TEMPERATURE)


def get_target_benchmark(
    language, target_dir,
    target_function_name) -> Optional[benchmarklib.Benchmark]:
  """Runs introspector analysis on a target directory and extracts a
  benchmark for the target function, or returns None if it is not found."""
  project = fi_oss_fuzz.analyse_folder(language=language,
                                       directory=target_dir,
                                       dump_output=False)
  # Trigger some analysis
  project.dump_module_logic(report_name='', dump_output=False)

  for function in project.all_functions:
    if function.name == target_function_name:
      # Pair each argument name with its corresponding type.
      param_list = []
      for idx, arg_name in enumerate(function.arg_names):
        param_list.append({'name': arg_name, 'type': function.arg_types[idx]})

      return benchmarklib.Benchmark(
          benchmark_id='sample',
          project='no-name',
          language=language,
          function_name=function.name,
          function_signature=function.sig,
          return_type=function.return_type,
          params=param_list,
          target_path=function.parent_source.source_file)
  return None


def construct_fuzz_prompt(model, benchmark) -> prompts.Prompt:
  """Constructs a harness-generation prompt for the target benchmark."""
  builder = prompt_builder.DefaultTemplateBuilder(model, benchmark=benchmark)
  fuzz_prompt = builder.build([])
  return fuzz_prompt


def main():
  args = parse_args()
  model = setup_model(args)

  target_benchmark = get_target_benchmark('c++', args.target_dir,
                                          args.function)
  if target_benchmark is None:
    print('Could not find target function. Exiting.')
    sys.exit(0)

  fuzz_prompt = construct_fuzz_prompt(model, target_benchmark)
  os.makedirs(args.response_dir, exist_ok=True)
  print('Querying with the prompt')
  print('-' * 40)
  print(fuzz_prompt.get())
  print('-' * 40)
  print(f'Running query and writing results in {args.response_dir}')
  model.query_llm(fuzz_prompt, response_dir=args.response_dir)


if __name__ == "__main__":
  main()
