From 4e3798ef7a6af994d7e09094cd399876e701d1a2 Mon Sep 17 00:00:00 2001 From: DavidKorczynski Date: Wed, 17 Jul 2024 23:02:19 +0100 Subject: [PATCH 1/2] experimental: delete docker images after runs (#497) otherwise the storage is getting eaten too fast --------- Signed-off-by: David Korczynski --- experimental/c-cpp/runner.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/experimental/c-cpp/runner.py b/experimental/c-cpp/runner.py index e27198c93c..d2e3268bef 100644 --- a/experimental/c-cpp/runner.py +++ b/experimental/c-cpp/runner.py @@ -251,6 +251,15 @@ def run_on_targets(target, build_heuristics=build_heuristics, generator_heuristics=generator_heuristics) + # Cleanup the OSS-Fuzz docker image + clean_up_cmd = [ + 'docker', 'image', 'rm', f'gcr.io/oss-fuzz/{worker_project_name}' + ] + try: + subprocess.check_call(' '.join(clean_up_cmd), shell=True) + except subprocess.CalledProcessError: + pass + if semaphore is not None: semaphore.release() From 4309defce32e53b9ea237d868ded3436e88fab73 Mon Sep 17 00:00:00 2001 From: DavidKorczynski Date: Thu, 18 Jul 2024 11:34:46 +0100 Subject: [PATCH 2/2] experimental: add test-to-harness conversion logic (#495) Adds a fuzz harness heuristic that relies on converting existing tests. At this stage, it's done without relying on FI; we simply (1) find test files in the target project; (2) read them; (3) for each test file we use a simple prompt to convert it into a harness. 
At this stage, it already out-performs on some existing projects, e.g: https://github.com/jkuhlmann/cgltf/blob/master/test/main.c In this case, we have a harness generated that looks quite nice: ```c // Heuristic: TestConverterPrompt :: Target: #include #include #include #include #define CGLTF_IMPLEMENTATION #include "cgltf.h" extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { if (size < 1) { return 0; } cgltf_options options; memset(&options, 0, sizeof(cgltf_options)); cgltf_data* parsed_data = NULL; cgltf_result result; // Parse input data result = cgltf_parse(&options, data, size, &parsed_data); if (result == cgltf_result_success) { result = cgltf_validate(parsed_data); } if (result == cgltf_result_success) { // Use the parsed data in some way // For example, print file type and mesh count printf("Type: %u\n", parsed_data->file_type); printf("Meshes: %u\n", (unsigned)parsed_data->meshes_count); } cgltf_free(parsed_data); return 0; } ``` Ref: https://github.com/google/oss-fuzz-gen/issues/494 --------- Signed-off-by: David Korczynski --- experimental/c-cpp/manager.py | 160 ++++++++++++++++++++++++++++++++-- 1 file changed, 155 insertions(+), 5 deletions(-) diff --git a/experimental/c-cpp/manager.py b/experimental/c-cpp/manager.py index 62f922d18d..3c41311ad0 100644 --- a/experimental/c-cpp/manager.py +++ b/experimental/c-cpp/manager.py @@ -52,6 +52,14 @@ def setup_model(model: str): LLM_MODEL = model +class Test: + """Holder of data about tests used by a repository.""" + + def __init__(self, test_path, test_content): + self.test_path = test_path + self.test_content = test_content + + class AutogeneratedHarness: """Represents a generated harness and holds corresponding artifacts.""" @@ -79,6 +87,7 @@ def dump_build_and_harness(self, fuzzer_gen_dir: str) -> None: # Write so they can be build using `compile` with open(self.harness_path, 'w') as f: f.write(self.source_code) + f.write("\n//david") with open('/src/build.sh', 'w') as f: 
f.write(self.build_script) @@ -169,9 +178,15 @@ def __init__(self, introspector_report: Dict[str, Any], self.introspector_report = introspector_report self.github_url = '' - @abstractmethod - def get_fuzzer_intrinsics(self, func) -> Dict[str, Any]: - """generates fuzzer source code, build and include directives.""" + def get_fuzzer_intrinsics(self, func) -> Dict[str, Any]: # pylint: disable=unused-argument + """Generates fuzzer source code, build and include directives.""" + # By default return empty dictionary. + return {} + + def get_fuzzer_test_intrinsics(self, test_case: Test) -> Dict[str, Any]: # pylint: disable=unused-argument + """Generates fuzzer source code, build and include directives.""" + # By default return empty dictionary. + return {} @abstractmethod def get_fuzzing_targets(self) -> List[Any]: @@ -398,6 +413,80 @@ def get_fuzzer_intrinsics(self, func: Dict[str, Any]) -> Dict[str, Any]: return fuzzer_intrinsics +class FuzzerGenHeuristicTestConverter(FuzzHeuristicGeneratorBase): + """Heuristic that provides context around target function.""" + language = 'c' + name = 'TestConverterPrompt' + + def __init__(self, introspector_report: Dict[str, Any], + all_header_files: List[str], test_dir: str): + super().__init__(introspector_report, all_header_files, test_dir) + self.introspector_report = introspector_report + self.all_header_files = all_header_files + self.github_url = '' + + def get_fuzzing_targets(self) -> List: + return [] + + def get_fuzzer_test_intrinsics(self, test_case: Test) -> Dict[str, Any]: + """Returns the fuzzer intrinsics based on test conversion.""" + (headers_to_include, _, + build_command_includes) = self.get_header_intrinsics() + + # Include any weird macros defined that does not have any values. This + # was found empirically to be valuable. 
+ macros_defined_in_test = [] + for line in test_case.test_content.split('\n'): + if '#define' in line and len(line.split(' ')) == 2: + macros_defined_in_test.append(line) + + logger.info('Sample targets:') + prompt = f'''I'm a security engineer looking to convert unit tests into +fuzzing harnesses. + +The goal is to convert the following unit test into a fuzzing harness: + +```c +TEST_SOURCE_CODE +``` + +The target library is {self.github_url}. + +Please write a fuzzing harness that is inspired by this unittest. You shuold write the fuzzing harness in +a libFuzzer-stlye structure. This means the harness should use `int LLMVFuzzerTestOneInput`. + +Any macros defined in the test should also be included in the fuzz harness. + +There is one rule that your harness must satisfy: all of the header files in this library is {str(headers_to_include)}. Make sure to not include any header files not in this list. + +In your response, include *only* the code for the harness, nothing more. You should wrap the code in tags. 
+''' + prompt = prompt.replace('TEST_SOURCE_CODE', test_case.test_content) + self.log_prompt(prompt) + + fuzzer_source = self.run_prompt_and_get_fuzzer_source(prompt) + comment_on_target = f'// Heuristic: {self.name} :: Target: \n' + + total_fuzzer_source = comment_on_target + # Add any macros not already in the harness + for macro in macros_defined_in_test: + macro_name = macro.split(' ')[1] + if macro_name not in fuzzer_source: + total_fuzzer_source += macro + '\n' + + total_fuzzer_source += FUZZER_PRE_HEADERS + total_fuzzer_source += fuzzer_source + + fuzzer_intrinsics = { + 'full-source-code': total_fuzzer_source, + 'build-command-includes': build_command_includes, + 'autogen-id': f'{self.name}-{test_case.test_path}', + 'prompt': prompt + } + + return fuzzer_intrinsics + + class FuzzerGenHeuristic5(FuzzHeuristicGeneratorBase): """Heuristic that provides context around target function.""" language = 'c' @@ -923,6 +1012,37 @@ def log_fuzzer_source(full_fuzzer_source: str): logger.info(harness_source_out) +def get_tests_converted_to_harnesses(build_results, language, test_dir, + fuzzer_build_cmd, all_header_files, + all_test_scripts, github_url): + """Converts a list of test files into fuzzing harnesses.""" + return_list = [] + + _, _, fuzzer_target_file, _ = get_language_defaults(language) + + for test in all_test_scripts: + + fuzz_converter = FuzzerGenHeuristicTestConverter({}, all_header_files, + test_dir) + fuzz_converter.github_url = github_url + fuzzer_intrinsics = fuzz_converter.get_fuzzer_test_intrinsics(test) + + # Generate a build script for compiling the fuzzer with ASAN. 
+ final_asan_build_script = build_results[test_dir].build_script + '\n' + fuzzer_out = '/src/generated-fuzzer' + fuzz_cmd = ' '.join(fuzzer_build_cmd) + fuzz_includes = fuzzer_intrinsics['build-command-includes'] + final_asan_build_script += f'{fuzz_cmd} {fuzz_includes} -o {fuzzer_out}' + + return_list.append( + AutogeneratedHarness(final_asan_build_script, + fuzzer_intrinsics['full-source-code'], + fuzzer_target_file, fuzzer_out, fuzzer_intrinsics, + language)) + + return return_list + + def generate_harness_intrinsics( heuristic: FuzzHeuristicGeneratorBase, results, @@ -1256,6 +1376,26 @@ def get_heuristics_to_use() -> List[Type[FuzzHeuristicGeneratorBase]]: return heuristics_to_apply +def get_all_test_scripts(target_source_path) -> List[Test]: + """Returns a list of the test files in the target source path.""" + all_files = get_all_files_in_path(target_source_path) + all_tests = [] + for file in all_files: + split_path = file.split('/') + is_test = any(['test' in path for path in split_path]) + if not is_test: + continue + test_extensions = ['.cc', '.cpp', '.cxx', '.c++', 'c'] + if not any(file.endswith(ext) for ext in test_extensions): + continue + # Let's say this is a test + logger.info('Found test: %s', file) + with open(file, 'r') as f: + file_content = f.read() + all_tests.append(Test(file, file_content)) + return all_tests + + def auto_generate(github_url, disable_testing_build_scripts=False, disable_fuzzgen=False, @@ -1324,6 +1464,9 @@ def auto_generate(github_url, folders_with_results = set() logger.info('Going through %d build results to generate fuzzers', len(build_results)) + + all_test_scripts = get_all_test_scripts(target_source_path) + for test_dir, build_worker in build_results.items(): logger.info('Checking build heuristic: %s', build_worker.build_suggestion.heuristic_id) @@ -1385,8 +1528,15 @@ def auto_generate(github_url, logger.info('Applying %s', heuristic.name) heuristic.github_url = github_url - harness_builds_to_validate = 
generate_harness_intrinsics( - heuristic, build_results, language, test_dir, fuzzer_build_cmd) + harness_builds_to_validate = [] + harness_builds_to_validate.extend( + generate_harness_intrinsics(heuristic, build_results, language, + test_dir, fuzzer_build_cmd)) + + harness_builds_to_validate.extend( + get_tests_converted_to_harnesses(build_results, language, test_dir, + fuzzer_build_cmd, all_header_files, + all_test_scripts, github_url)) # Build the fuzzer for each project logger.info('Fuzzer harnesses to evaluate: %d',