From cb16e7890b188f4e3683aa18f07b9d142466b8e4 Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Sun, 5 Oct 2025 16:41:41 +0300 Subject: [PATCH 1/9] feat: add 'manifest_path' and 'ecosystem' fields to input payload Signed-off-by: Zvi Grinberg  Conflicts:  src/vuln_analysis/functions/cve_generate_vdbs.py  src/vuln_analysis/tools/transitive_code_search.py  src/vuln_analysis/utils/transitive_code_searcher_tool.py --- .../functions/cve_generate_vdbs.py | 20 ++++++----- .../tools/transitive_code_search.py | 25 ++++++------- src/vuln_analysis/utils/document_embedding.py | 9 +++-- .../utils/source_code_git_loader.py | 21 ++++++----- .../utils/transitive_code_searcher_tool.py | 35 +++++++++++++++---- 5 files changed, 73 insertions(+), 37 deletions(-) diff --git a/src/vuln_analysis/functions/cve_generate_vdbs.py b/src/vuln_analysis/functions/cve_generate_vdbs.py index 54aae31a..a7449384 100644 --- a/src/vuln_analysis/functions/cve_generate_vdbs.py +++ b/src/vuln_analysis/functions/cve_generate_vdbs.py @@ -70,7 +70,7 @@ async def generate_vdb(config: CVEGenerateVDBsToolConfig, builder: Builder): from vuln_analysis.utils.source_rpm_downloader import RPMDependencyManager from vuln_analysis.data_models.input import ManualSBOMInfoInput from vuln_analysis.utils.standard_library_cache import StandardLibraryCache - + agent_config = builder.get_function_config(config.agent_name) assert isinstance(agent_config, CVEAgentExecutorToolConfig) @@ -85,8 +85,8 @@ async def generate_vdb(config: CVEGenerateVDBsToolConfig, builder: Builder): config.ignore_code_index = True embedding = await builder.get_embedder(embedder_name=config.embedder_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN) - - # Configure RPM singleton with cache directory from config + + # Configure RPM singleton with cache directory from config rpm_manager = RPMDependencyManager.get_instance() rpm_manager.set_rpm_cache_dir(config.base_rpm_dir) cache_std = StandardLibraryCache.get_instance() @@ -135,7 +135,7 @@ def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: Docume logger.info("Completed code indexing in %.2f seconds for '%s'", time.time() - indexing_start_time, output_path) return True - def _build_code_index(source_infos: list[SourceDocumentsInfo]) -> Path | None: + def _build_code_index(source_infos: list[SourceDocumentsInfo], ecosystem, manifest_path) -> Path | None: code_index_path: Path | None = None # Filter to only code sources @@ -147,7 +147,9 @@ def _build_code_index(source_infos: list[SourceDocumentsInfo]) -> Path | None: embedder = DocumentEmbedding(embedding=None, vdb_directory=config.base_vdb_dir, git_directory=config.base_git_dir, - pickle_cache_directory=config.base_pickle_dir) + pickle_cache_directory=config.base_pickle_dir, + manifest_path=manifest_path, + ecosystem=ecosystem) # Determine code index path for either loading from cache or creating new index # Need to add support for configurable base path @@ -188,6 +190,8 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput: base_image = message.image.name source_infos = message.image.source_info sbom_infos = message.image.sbom_info + ecosystem = message.image.ecosystem + manifest_path = message.image.manifest_path try: trace_id.set(message.scan.id) @@ -208,13 +212,13 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput: # Build code index if not ignored if not config.ignore_code_index: - logger.info("analysis type: %s", message.image.analysis_type) + logger.info("analysis type: %s", message.image.analysis_type) if 
message.image.analysis_type == AnalysisType.IMAGE and isinstance(sbom_infos, ManualSBOMInfoInput): RPMDependencyManager.get_instance().sbom = sbom_infos.packages image = f"{message.image.name}:{message.image.tag}" RPMDependencyManager.get_instance().container_image = image - - code_index_path = _build_code_index(source_infos) + + code_index_path = _build_code_index(source_infos, ecosystem, manifest_path) if code_index_path is None: logger.warning(("Failed to generate code index for image '%s'. " diff --git a/src/vuln_analysis/tools/transitive_code_search.py b/src/vuln_analysis/tools/transitive_code_search.py index 0a21c409..8f92c59a 100644 --- a/src/vuln_analysis/tools/transitive_code_search.py +++ b/src/vuln_analysis/tools/transitive_code_search.py @@ -43,7 +43,7 @@ logger = LoggingFactory.get_agent_logger(__name__) -class TransitiveCodeSearchToolConfig(FunctionBaseConfig, name=("%s" % TRANSITIVE_CODE_SEARCH_TOOL_NAME)): +class TransitiveCodeSearchToolConfig(FunctionBaseConfig, name=TRANSITIVE_CODE_SEARCH_TOOL_NAME): """ Transitive code search tool used to search source code. """ @@ -55,13 +55,13 @@ class CallingFunctionNameExtractorToolConfig(FunctionBaseConfig, name=FUNCTION_N """ -class PackageAndFunctionLocatorToolConfig(FunctionBaseConfig, name=("%s" % PACKAGE_AND_FUNCTION_LOCATOR_TOOL_NAME)): +class PackageAndFunctionLocatorToolConfig(FunctionBaseConfig, name=PACKAGE_AND_FUNCTION_LOCATOR_TOOL_NAME): """ Package and function locator tool used to validate package names and find function names using fuzzy matching. """ -def get_call_of_chains_retriever(documents_embedder, si): +def get_call_of_chains_retriever(documents_embedder, si, ecosystem): documents: list[Document] git_repo = None for source_info in si: @@ -70,8 +70,10 @@ def get_call_of_chains_retriever(documents_embedder, si): documents = documents_embedder.collect_documents(source_info) if git_repo is None: raise ValueError("No code source info found") - with open(os.path.join(git_repo, 'ecosystem_data.txt'), 'r', encoding='utf-8') as file: - ecosystem = file.read() + if not ecosystem: + with open(os.path.join(git_repo, 'ecosystem_data.txt'), 'r', encoding='utf-8') as file: + ecosystem = file.read() + ecosystem = Ecosystem[ecosystem] coc_retriever = ChainOfCallsRetriever(documents=documents, ecosystem=ecosystem, manifest_path=git_repo) return coc_retriever @@ -82,7 +84,7 @@ def get_transitive_code_searcher(): if state.transitive_code_searcher is None: si = state.original_input.input.image.source_info documents_embedder = DocumentEmbedding(embedding=None) - coc_retriever = get_call_of_chains_retriever(documents_embedder, si) + coc_retriever = get_call_of_chains_retriever(documents_embedder, si, state.original_input.input.image.ecosystem) transitive_code_searcher = TransitiveCodeSearcher(chain_of_calls_retriever=coc_retriever) state.transitive_code_searcher = transitive_code_searcher return state.transitive_code_searcher @@ -179,7 +181,7 @@ async def _arun(query: str) -> list: @register_function(config_type=PackageAndFunctionLocatorToolConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN]) async def package_and_function_locator(config: PackageAndFunctionLocatorToolConfig, - builder: Builder): # pylint: disable=unused-argument + builder: Builder): # pylint: disable=unused-argument @catch_tool_errors(PACKAGE_AND_FUNCTION_LOCATOR_TOOL_NAME) async def _arun(query: str) -> dict: @@ -190,13 +192,12 @@ async def _arun(query: str) -> dict: locator = FunctionNameLocator(coc_retriever) result = await locator.locate_functions(query) 
pkg_msg = "Package is valid." - if not locator.is_package_valid and not locator.is_std_package: - pkg_msg = "Package is not valid." - - + if not locator.is_package_valid and not locator.is_std_package: + pkg_msg = "Package is not valid." + return { "ecosystem": coc_retriever.ecosystem.name, - "package_msg": pkg_msg, + "package_msg": pkg_msg, "result": result } diff --git a/src/vuln_analysis/utils/document_embedding.py b/src/vuln_analysis/utils/document_embedding.py index e01a600a..c3ac813f 100644 --- a/src/vuln_analysis/utils/document_embedding.py +++ b/src/vuln_analysis/utils/document_embedding.py @@ -37,6 +37,7 @@ from langchain_core.document_loaders.blob_loaders import Blob from vuln_analysis.data_models.input import SourceDocumentsInfo +from vuln_analysis.utils.dep_tree import Ecosystem from vuln_analysis.utils.go_segmenters_with_methods import GoSegmenterWithMethods from vuln_analysis.utils.python_segmenters_with_classes_methods import PythonSegmenterWithClassesMethods from vuln_analysis.utils.js_extended_parser import ExtendedJavaScriptSegmenter @@ -258,7 +259,7 @@ class DocumentEmbedding: def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = "./.cache/am_cache/vdb", git_directory: PathLike = "./.cache/am_cache/git", chunk_size: int = 800, chunk_overlap: int = 160, - pickle_cache_directory: PathLike = "./.cache/am_cache/pickle"): + pickle_cache_directory: PathLike = "./.cache/am_cache/pickle", ecosystem: Ecosystem, manifest_path: str): """ Create a new DocumentEmbedding instance. @@ -284,6 +285,8 @@ def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = "./.cac self._chunk_size = chunk_size self._chunk_overlap = chunk_overlap self._pickle_cache_directory = Path(pickle_cache_directory) + self._ecosystem = ecosystem + self._manifest_path = manifest_path @property def embedding(self): @@ -397,7 +400,9 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]: clone_url=source_info.git_repo, ref=source_info.ref, include=source_info.include, - exclude=source_info.exclude) + exclude=source_info.exclude, + manifest_path=self._manifest_path, + ecosystem=self._ecosystem) blob_parser = ExtendedLanguageParser() loader = GenericLoader(blob_loader=blob_loader, blob_parser=blob_parser) diff --git a/src/vuln_analysis/utils/source_code_git_loader.py b/src/vuln_analysis/utils/source_code_git_loader.py index ae2b089d..3bf50750 100644 --- a/src/vuln_analysis/utils/source_code_git_loader.py +++ b/src/vuln_analysis/utils/source_code_git_loader.py @@ -25,6 +25,7 @@ from langchain_core.document_loaders.blob_loaders import Blob from tqdm import tqdm +from vuln_analysis.utils.dep_tree import Ecosystem from vuln_analysis.utils.transitive_code_searcher_tool import TransitiveCodeSearcher from vuln_analysis.logging.loggers_factory import LoggingFactory @@ -47,14 +48,11 @@ class SourceCodeGitLoader(BlobLoader): files from. By default, it loads from the `main` branch. """ - def __init__( - self, - repo_path: PathLike, - clone_url: str | None = None, - ref: typing.Optional[str] = "main", - include: typing.Optional[typing.Iterable[str]] = None, - exclude: typing.Optional[typing.Iterable[str]] = None, - ): + def __init__(self, repo_path: PathLike, clone_url: str | None = None, ref: typing.Optional[str] = "main", + include: typing.Optional[typing.Iterable[str]] = None, + exclude: typing.Optional[typing.Iterable[str]] = None, + manifest_path: str = None, + ecosystem: Ecosystem = None): """ Initialize the Git loader. 
@@ -70,6 +68,8 @@ def __init__( A list of file patterns to include. Uses the glob syntax, by default None exclude : typing.Optional[typing.Iterable[str]], optional A list of file patterns to exclude. Uses the glob syntax, by default None + :param manifest_path: + :param ecosystem: """ self.repo_path = Path(repo_path) @@ -80,6 +80,8 @@ def __init__( self.exclude = exclude self._repo: Repo | None = None + self._manifest_path = manifest_path + self._ecosystem = ecosystem def load_repo(self): """ @@ -138,7 +140,8 @@ def load_repo(self): repo.git.checkout(self.ref, "--force") logger.info("Loaded Git repository at path: '%s' @ '%s'", self.repo_path, self.ref) - TransitiveCodeSearcher.download_dependencies(self.repo_path) + TransitiveCodeSearcher.download_dependencies(self.repo_path, manifest_path= self._manifest_path, + the_ecosystem=self._ecosystem) self._repo = repo return repo diff --git a/src/vuln_analysis/utils/transitive_code_searcher_tool.py b/src/vuln_analysis/utils/transitive_code_searcher_tool.py index bf65ac3d..c7384275 100644 --- a/src/vuln_analysis/utils/transitive_code_searcher_tool.py +++ b/src/vuln_analysis/utils/transitive_code_searcher_tool.py @@ -38,6 +38,13 @@ logger = LoggingFactory.get_agent_logger(f"morpheus.{__name__}") +def determine_manifest_name_by_ecosystem(the_ecosystem): + for manifest_name, ecosystem in MANIFESTS_TO_ECOSYSTEMS.items(): + if ecosystem == the_ecosystem: + return manifest_name + + return None + class TransitiveCodeSearcher: """ Transitive code Searcher for code using a Chain Of Calls Retriever object @@ -53,19 +60,33 @@ def __init__(self, chain_of_calls_retriever: ChainOfCallsRetriever): self.chain_of_calls_retriever = chain_of_calls_retriever @staticmethod - def download_dependencies(git_repo_path: Path) -> bool: + def download_dependencies(git_repo_path: Path, manifest_path: str = None, the_ecosystem: Ecosystem = None) -> bool: """ Download all dependencies according to manifest file in the Git repository Parameters ---------- git_repo_path : Path Git repository path to fetch the application manifests from + manifest_path: str + path to manifest file within the Git repository Returns whether dependencies were downloaded or not. + :param the_ecosystem: + :param git_repo_path: + :param manifest_path: """ ecosystem: Ecosystem # Check the root dir of the repo for existence of manifests, the precedence of which manifest file t o check is # according to the order from top to bottom - if os.path.isfile(git_repo_path / GOLANG_MANIFEST): + path_to_manifest: Path + # If manifest path is supplied in input, override the default root repo dir as dir of manifest file with this value. + if manifest_path: + path_to_manifest = git_repo_path.joinpath(manifest_path) + else: + path_to_manifest = git_repo_path + # If ecosystem is supplied in input, then override default of first found ecosystem manifest in the repo. + if the_ecosystem and os.path.isfile(path_to_manifest / determine_manifest_name_by_ecosystem(the_ecosystem)): + ecosystem = the_ecosystem + elif os.path.isfile(path_to_manifest / GOLANG_MANIFEST): ecosystem = MANIFESTS_TO_ECOSYSTEMS[GOLANG_MANIFEST] elif os.path.isfile(git_repo_path / PYTHON_MANIFEST): ecosystem = MANIFESTS_TO_ECOSYSTEMS[PYTHON_MANIFEST] @@ -73,6 +94,7 @@ def download_dependencies(git_repo_path: Path) -> bool: ecosystem = MANIFESTS_TO_ECOSYSTEMS[JS_MANIFEST] elif os.path.isfile(git_repo_path / JAVA_MANIFEST): ecosystem = MANIFESTS_TO_ECOSYSTEMS[JAVA_MANIFEST] + # Search for C/C++ manifest else: # 1. 
Direct checks candidates = [ @@ -91,11 +113,12 @@ def download_dependencies(git_repo_path: Path) -> bool: return False try: - logger.info(f"Started installing packages for {ecosystem}") - tree_builder = get_dependency_tree_builder(ecosystem) + logger.info(f"Started installing packages for {ecosystem}") + tree_builder = get_dependency_tree_builder(ecosystem.value) tree_builder.install_dependencies(git_repo_path) - with open(os.path.join(git_repo_path, 'ecosystem_data.txt'), 'w') as file: - file.write(ecosystem.name) + if not the_ecosystem: + with open(os.path.join(git_repo_path, 'ecosystem_data.txt'), 'w') as file: + file.write(ecosystem.name) logger.info(f"Finished installing packages for {ecosystem}") return True except NotImplementedError as err: From 186a97c0de765d06f4049e6897ec57fac95fd2e7 Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Sun, 19 Oct 2025 17:48:48 +0300 Subject: [PATCH 2/9] fix: add default blank values for new fields Signed-off-by: Zvi Grinberg --- src/vuln_analysis/utils/document_embedding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vuln_analysis/utils/document_embedding.py b/src/vuln_analysis/utils/document_embedding.py index c3ac813f..950b765e 100644 --- a/src/vuln_analysis/utils/document_embedding.py +++ b/src/vuln_analysis/utils/document_embedding.py @@ -259,7 +259,8 @@ class DocumentEmbedding: def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = "./.cache/am_cache/vdb", git_directory: PathLike = "./.cache/am_cache/git", chunk_size: int = 800, chunk_overlap: int = 160, - pickle_cache_directory: PathLike = "./.cache/am_cache/pickle", ecosystem: Ecosystem, manifest_path: str): + pickle_cache_directory: PathLike = "./.cache/am_cache/pickle", ecosystem: Ecosystem = None, + manifest_path: str = None): """ Create a new DocumentEmbedding instance. 
From ac054b30df4b00fe58d160085bf8546cc47f3ea0 Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Sun, 19 Oct 2025 17:50:20 +0300 Subject: [PATCH 3/9] style: some minor changes Signed-off-by: Zvi Grinberg --- .../tools/tests/test_transitive_code_search.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/vuln_analysis/tools/tests/test_transitive_code_search.py b/src/vuln_analysis/tools/tests/test_transitive_code_search.py index 5622ed20..8207305e 100644 --- a/src/vuln_analysis/tools/tests/test_transitive_code_search.py +++ b/src/vuln_analysis/tools/tests/test_transitive_code_search.py @@ -130,6 +130,7 @@ async def get_transitive_code_runner_function(): async for function in transitive_code_search.gen: return function.single_fn + python_dependency_tree_mock_output = ( 'deptree==0.0.12 # deptree\n' ' importlib-metadata==8.7.0 # importlib-metadata\n' @@ -147,6 +148,7 @@ async def get_transitive_code_runner_function(): ' mock-package==1.1.1' ) + def mock_file_open(*args, **kwargs): file_path = args[0] if args else kwargs.get('file', '') mock_file = MagicMock() @@ -179,14 +181,16 @@ def mock_file_open(*args, **kwargs): "search_query": "werkzeug,formparser.MultiPartParser.parse", "expected_path_found": False, "expected_list_length": 0, - "mock_documents": [python_script_example, python_init_function_example, python_full_document_example, python_parse_function_example] + "mock_documents": [python_script_example, python_init_function_example, python_full_document_example, + python_parse_function_example] }, { "name": "python_3", "search_query": "mock_package,mock_function_in_use", "expected_path_found": True, "expected_list_length": 3, - "mock_documents": [python_script_example, python_init_function_example, python_full_document_example, python_parse_function_example, python_mock_function_in_use, python_mock_file] + "mock_documents": [python_script_example, python_init_function_example, python_full_document_example, + python_parse_function_example, python_mock_function_in_use, python_mock_file] } ]) @patch('vuln_analysis.utils.dep_tree.run_command', return_value=python_dependency_tree_mock_output) @@ -204,7 +208,7 @@ async def test_transitive_search_python_parameterized(mock_open, mock_run_comman ) with patch('vuln_analysis.utils.document_embedding.retrieve_from_cache', - return_value=(test_case["mock_documents"], True)): + return_value=(test_case["mock_documents"], True)): result = await transitive_code_search_runner_coroutine(test_case["search_query"]) (path_found, list_path) = result @@ -215,6 +219,7 @@ async def test_transitive_search_python_parameterized(mock_open, mock_run_comman assert path_found == test_case["expected_path_found"] assert len(list_path) == test_case["expected_list_length"] + @pytest.mark.asyncio async def test_python_transitive_search(): """Test that runs with a real repository""" @@ -234,7 +239,7 @@ async def test_python_transitive_search(): print(f"DEBUG: path_found = {path_found}") print(f"DEBUG: list_path = {list_path}") print(f"DEBUG: len(list_path) = {len(list_path)}") - assert path_found == True + assert path_found is True assert len(list_path) == 2 From 2a06ef9c905ce9878f0f0bb921baca4d0ce442e0 Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Fri, 24 Oct 2025 18:52:15 +0300 Subject: [PATCH 4/9] fix: add some essential logging Signed-off-by: Zvi Grinberg --- src/vuln_analysis/utils/source_code_git_loader.py | 2 +- src/vuln_analysis/utils/transitive_code_searcher_tool.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff 
--git a/src/vuln_analysis/utils/source_code_git_loader.py b/src/vuln_analysis/utils/source_code_git_loader.py index 3bf50750..0fb677b2 100644 --- a/src/vuln_analysis/utils/source_code_git_loader.py +++ b/src/vuln_analysis/utils/source_code_git_loader.py @@ -134,7 +134,7 @@ def load_repo(self): repo.git.fetch("origin", self.ref, "--depth=1", "--force") tag_refspec = f"refs/tags/{self.ref}:refs/tags/{self.ref}" try: - repo.git.fetch("origin", tag_refspec, "--depth=1" , "--force") + repo.git.fetch("origin", tag_refspec, "--depth=1", "--force") except GitCommandError: pass repo.git.checkout(self.ref, "--force")
diff --git a/src/vuln_analysis/utils/transitive_code_searcher_tool.py b/src/vuln_analysis/utils/transitive_code_searcher_tool.py index c7384275..bc8418b7 100644 --- a/src/vuln_analysis/utils/transitive_code_searcher_tool.py +++ b/src/vuln_analysis/utils/transitive_code_searcher_tool.py @@ -81,11 +81,15 @@ def download_dependencies(git_repo_path: Path, manifest_path: str = None, the_ec # If manifest path is supplied in input, override the default root repo dir as dir of manifest file with this value. if manifest_path: path_to_manifest = git_repo_path.joinpath(manifest_path) + logger.info(f"manifest_path field supplied in request payload, overriding default value of " + f"root directory of repository." + f" relative manifest_path value => {manifest_path}, path_to_manifest=>{path_to_manifest}") else: path_to_manifest = git_repo_path # If ecosystem is supplied in input, then override default of first found ecosystem manifest in the repo. if the_ecosystem and os.path.isfile(path_to_manifest / determine_manifest_name_by_ecosystem(the_ecosystem)): ecosystem = the_ecosystem + logger.info(f"Ecosystem field supplied in request payload, ecosystem value => {ecosystem}") elif os.path.isfile(path_to_manifest / GOLANG_MANIFEST): ecosystem = MANIFESTS_TO_ECOSYSTEMS[GOLANG_MANIFEST] elif os.path.isfile(git_repo_path / PYTHON_MANIFEST): ecosystem = MANIFESTS_TO_ECOSYSTEMS[PYTHON_MANIFEST]
From c16cc321f5cbe1f486e56ed93d985041f53e8682 Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Sun, 26 Oct 2025 23:53:58 +0200 Subject: [PATCH 5/9] fix: a few needed adjustments Signed-off-by: Zvi Grinberg --- src/vuln_analysis/functions/cve_agent.py | 20 +++++++++++++------ .../functions/cve_generate_vdbs.py | 6 ++++-- .../tools/transitive_code_search.py | 1 + .../lang_functions_parsers_factory.py | 5 ++--- 4 files changed, 21 insertions(+), 11 deletions(-)
diff --git a/src/vuln_analysis/functions/cve_agent.py b/src/vuln_analysis/functions/cve_agent.py index f86833de..b0a02156 100644 --- a/src/vuln_analysis/functions/cve_agent.py +++ b/src/vuln_analysis/functions/cve_agent.py @@ -70,7 +70,16 @@ async def _create_agent(config: CVEAgentExecutorToolConfig, builder: Builder, tools = builder.get_tools(tool_names=config.tool_names, wrapper_type=LLMFrameworkEnum.LANGCHAIN) llm = await builder.get_llm(llm_name=config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN) prompt = PromptTemplate.from_template(get_agent_prompt(config.prompt, config.prompt_examples)) - + ecosystem = state.original_input.input.image.ecosystem + transitive_search_tool_supports_ecosystem: bool = True + # If an ecosystem was supplied in the input, check whether the transitive search tool supports it + if ecosystem: + try: + get_language_function_parser(ecosystem) + except NotImplementedError: + transitive_search_tool_supports_ecosystem = False + logger.warning(f"Transitive code search tool doesn't support programming language {ecosystem}," + f" disabling tool...") # Filter tools that are not available tools = [ tool for tool in tools @@ -78,12 +87,11 @@ async def _create_agent(config: 
CVEAgentExecutorToolConfig, builder: Builder, (tool.name == "Container Image Developer Guide QA System" and state.doc_vdb_path is None) or (tool.name == "Lexical Search Container Image Code QA System" and state.code_index_path is None) or (tool.name == "Transitive code search tool" and (not config.transitive_search_tool_enabled or - state.code_index_path is None)) or + state.code_index_path is None or + not transitive_search_tool_supports_ecosystem)) or (tool.name == "Calling Function Name Extractor" and (not config.transitive_search_tool_enabled or - state.code_index_path is None)) or - (tool.name == "Package and Function Locator" and (not config.transitive_search_tool_enabled or - state.code_index_path is None)) - + state.code_index_path is None or + not transitive_search_tool_supports_ecosystem)) ) ] diff --git a/src/vuln_analysis/functions/cve_generate_vdbs.py b/src/vuln_analysis/functions/cve_generate_vdbs.py index a7449384..1c8c366d 100644 --- a/src/vuln_analysis/functions/cve_generate_vdbs.py +++ b/src/vuln_analysis/functions/cve_generate_vdbs.py @@ -28,6 +28,7 @@ from vuln_analysis.data_models.common import AnalysisType from vuln_analysis.logging.loggers_factory import LoggingFactory, trace_id +from vuln_analysis.utils.dep_tree import Ecosystem logger = LoggingFactory.get_agent_logger(__name__) @@ -98,7 +99,7 @@ async def generate_vdb(config: CVEGenerateVDBsToolConfig, builder: Builder): pickle_cache_directory=config.base_pickle_dir) def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: DocumentEmbedding, - output_path: Path) -> bool : + output_path: Path) -> bool: logger.info("Collecting documents from git repos. Source Infos: %s", json.dumps([x.model_dump(mode="json") for x in source_infos])) @@ -135,7 +136,8 @@ def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: Docume logger.info("Completed code indexing in %.2f seconds for '%s'", time.time() - indexing_start_time, output_path) return True - def _build_code_index(source_infos: list[SourceDocumentsInfo], ecosystem, manifest_path) -> Path | None: + def _build_code_index(source_infos: list[SourceDocumentsInfo], ecosystem: Ecosystem = None, + manifest_path: str = None) -> Path | None: code_index_path: Path | None = None # Filter to only code sources diff --git a/src/vuln_analysis/tools/transitive_code_search.py b/src/vuln_analysis/tools/transitive_code_search.py index 8f92c59a..36c860d1 100644 --- a/src/vuln_analysis/tools/transitive_code_search.py +++ b/src/vuln_analysis/tools/transitive_code_search.py @@ -68,6 +68,7 @@ def get_call_of_chains_retriever(documents_embedder, si, ecosystem): if source_info.type == "code": git_repo = documents_embedder.get_repo_path(source_info) documents = documents_embedder.collect_documents(source_info) + if git_repo is None: raise ValueError("No code source info found") if not ecosystem: diff --git a/src/vuln_analysis/utils/functions_parsers/lang_functions_parsers_factory.py b/src/vuln_analysis/utils/functions_parsers/lang_functions_parsers_factory.py index f6252bd2..0c912a28 100644 --- a/src/vuln_analysis/utils/functions_parsers/lang_functions_parsers_factory.py +++ b/src/vuln_analysis/utils/functions_parsers/lang_functions_parsers_factory.py @@ -11,8 +11,7 @@ def get_language_function_parser(ecosystem: Ecosystem, tree: DependencyTree | No :param ecosystem: the desired programming language parser. :param tree: the dependency tree for the ecosystem, can be None. 
:return: - The right language functions parser associated to the ecosystem, if not exists, return ABC parent that - doesn't do anything + The right language functions parser associated with the ecosystem; if none exists, a NotImplementedError is raised """ if ecosystem == Ecosystem.GO: return GoLanguageFunctionsParser() @@ -24,4 +23,4 @@ def get_language_function_parser(ecosystem: Ecosystem, tree: DependencyTree | No parser_obj.init_CParser(tree) return parser_obj else: - return LanguageFunctionsParser() + raise NotImplementedError(f"Language functions parser for {ecosystem} not implemented.")
From cfb7f75dbe5dd651d2196c50e2d59a0d77578c2f Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Sun, 26 Oct 2025 23:54:52 +0200 Subject: [PATCH 6/9] docs: sharpen the instructions for creating/updating oauthclient Signed-off-by: Zvi Grinberg --- kustomize/README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kustomize/README.md b/kustomize/README.md index 6fdcc40d..fc3892fb 100644 --- a/kustomize/README.md +++ b/kustomize/README.md @@ -115,13 +115,16 @@ EOF 5. Create the `oauth-secret.env` file containing the `client-secret` and `openshift-domain` values required by the [ExploitIQ Client](./base/exploit_iq_client.yaml) configuration. -Replace `some-long-secret-used-by-the-oauth-client` with a more secure, unique secret +If an OpenShift resource of kind `OAuthClient` named `exploit-iq-client` already exists, just get the secret from it: +```shell +export OAUTH_CLIENT_SECRET=$(oc get oauthclient exploit-iq-client -o jsonpath='{..secret}') +``` +Otherwise, replace `some-long-secret-used-by-the-oauth-client` with a more secure, unique secret of your own: ```shell export OAUTH_CLIENT_SECRET="some-long-secret-used-by-the-oauth-client" ``` - ```shell cat > base/oauth-secrets.env << EOF client-secret=$OAUTH_CLIENT_SECRET EOF @@ -168,7 +171,7 @@ redirectURIs: - "http://$(oc get route exploit-iq-client -o jsonpath='{.spec.host}')/app/index.html" EOF ``` -Otherwise, just add your route to the existing `OAuthClient` CR object: +Otherwise (if creating the `OAuthClient` instance failed because it already exists in the cluster), just add your route to the existing `OAuthClient` CR object: ```shell export HTTPS_ROUTE=https://$(oc get route exploit-iq-client -o jsonpath='{.spec.host}')/app/index.html export HTTP_ROUTE=http://$(oc get route exploit-iq-client -o jsonpath='{.spec.host}')/app/index.html
From bbd84d24ff4159ae915c5384f459b9ce7c90da58 Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Tue, 28 Oct 2025 16:44:20 +0200 Subject: [PATCH 7/9] fix: indentation error Signed-off-by: Zvi Grinberg --- src/vuln_analysis/utils/transitive_code_searcher_tool.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/vuln_analysis/utils/transitive_code_searcher_tool.py b/src/vuln_analysis/utils/transitive_code_searcher_tool.py index bc8418b7..5cb1385e 100644 --- a/src/vuln_analysis/utils/transitive_code_searcher_tool.py +++ b/src/vuln_analysis/utils/transitive_code_searcher_tool.py @@ -121,8 +121,8 @@ def download_dependencies(git_repo_path: Path, manifest_path: str = None, the_ec tree_builder = get_dependency_tree_builder(ecosystem.value) tree_builder.install_dependencies(git_repo_path) if not the_ecosystem: - with open(os.path.join(git_repo_path, 'ecosystem_data.txt'), 'w') as file: - file.write(ecosystem.name) + with open(os.path.join(git_repo_path, 'ecosystem_data.txt'), 'w') as file: + file.write(ecosystem.name) logger.info(f"Finished installing packages for {ecosystem}") return True 
except NotImplementedError as err: From 47a1905c3c8e985f9305461f0cbc2af9b8ead26c Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Tue, 28 Oct 2025 16:53:23 +0200 Subject: [PATCH 8/9] fix: pass ecosystem enum directly to get_dependency_tree_builder Signed-off-by: Zvi Grinberg --- src/vuln_analysis/utils/transitive_code_searcher_tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/vuln_analysis/utils/transitive_code_searcher_tool.py b/src/vuln_analysis/utils/transitive_code_searcher_tool.py index 5cb1385e..8b73fbe1 100644 --- a/src/vuln_analysis/utils/transitive_code_searcher_tool.py +++ b/src/vuln_analysis/utils/transitive_code_searcher_tool.py @@ -118,7 +118,7 @@ def download_dependencies(git_repo_path: Path, manifest_path: str = None, the_ec try: logger.info(f"Started installing packages for {ecosystem}") - tree_builder = get_dependency_tree_builder(ecosystem.value) + tree_builder = get_dependency_tree_builder(ecosystem) tree_builder.install_dependencies(git_repo_path) if not the_ecosystem: with open(os.path.join(git_repo_path, 'ecosystem_data.txt'), 'w') as file: file.write(ecosystem.name)
From ec09712c4973b5555f7cad4736d2a034928f4433 Mon Sep 17 00:00:00 2001 From: Zvi Grinberg Date: Thu, 30 Oct 2025 09:12:59 +0200 Subject: [PATCH 9/9] fix: a few bug fixes Signed-off-by: Zvi Grinberg --- .../tools/tests/test_transitive_code_search.py | 2 +- src/vuln_analysis/tools/transitive_code_search.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/src/vuln_analysis/tools/tests/test_transitive_code_search.py b/src/vuln_analysis/tools/tests/test_transitive_code_search.py index 8207305e..db3af798 100644 --- a/src/vuln_analysis/tools/tests/test_transitive_code_search.py +++ b/src/vuln_analysis/tools/tests/test_transitive_code_search.py @@ -304,4 +304,4 @@ async def test_c_transitive_search_2(): print(f"DEBUG: list_path = {list_path}") print(f"DEBUG: len(list_path) = {len(list_path)}") assert len(list_path) == 1 - assert path_found == False + assert path_found is False
diff --git a/src/vuln_analysis/tools/transitive_code_search.py b/src/vuln_analysis/tools/transitive_code_search.py index 36c860d1..e2763900 100644 --- a/src/vuln_analysis/tools/transitive_code_search.py +++ b/src/vuln_analysis/tools/transitive_code_search.py @@ -61,7 +61,7 @@ class PackageAndFunctionLocatorToolConfig(FunctionBaseConfig, name=PACKAGE_AND_F """ -def get_call_of_chains_retriever(documents_embedder, si, ecosystem): +def get_call_of_chains_retriever(documents_embedder, si, ecosystem, manifest_path: str): documents: list[Document] git_repo = None for source_info in si: @@ -74,9 +74,14 @@ def get_call_of_chains_retriever(documents_embedder, si, ecosystem): if not ecosystem: with open(os.path.join(git_repo, 'ecosystem_data.txt'), 'r', encoding='utf-8') as file: ecosystem = file.read() - + # Resolve the manifest directory relative to the repo root when manifest_path is supplied ecosystem = Ecosystem[ecosystem] if manifest_path: + git_repo_with_manifest = git_repo.joinpath(manifest_path) + else: + git_repo_with_manifest = git_repo + coc_retriever = ChainOfCallsRetriever(documents=documents, ecosystem=ecosystem, + manifest_path=git_repo_with_manifest) return coc_retriever @@ -82,7 +84,7 @@ def get_transitive_code_searcher(): if state.transitive_code_searcher is None: si = state.original_input.input.image.source_info documents_embedder = DocumentEmbedding(embedding=None) - coc_retriever = 
get_call_of_chains_retriever(documents_embedder, si, state.original_input.input.image.ecosystem) + coc_retriever = get_call_of_chains_retriever(documents_embedder, si, state.original_input.input.image.ecosystem + , state.original_input.input.image.manifest_path) transitive_code_searcher = TransitiveCodeSearcher(chain_of_calls_retriever=coc_retriever) state.transitive_code_searcher = transitive_code_searcher return state.transitive_code_searcher
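
For reference, this patch series makes `ecosystem` and `manifest_path` optional fields of the `image` section of the input payload: `ecosystem` overrides the manifest auto-detection, and `manifest_path` names the directory, relative to the repository root, that holds the manifest file. The Python sketch below shows how such a payload section might look; it is an illustrative assumption, not part of the patches: the field names come from the diffs, all values are placeholders, and the exact accepted spelling of the ecosystem value depends on the `Ecosystem` data model.

# Hypothetical sketch of the 'image' section of a scan request using the new optional fields.
# Field names are taken from the patches; every value below is an illustrative placeholder.
import json

image_section = {
    "name": "quay.io/example/demo-app",   # existing field, placeholder value
    "tag": "1.0.0",                       # existing field, placeholder value
    "ecosystem": "PYTHON",                # new optional field; spelling depends on the Ecosystem model
    "manifest_path": "services/api",      # new optional field; manifest directory relative to the repo root
    "source_info": [],                    # existing field, placeholder value
    "sbom_info": {},                      # existing field, placeholder value
}

print(json.dumps(image_section, indent=2))

When either field is omitted, the code in patch 1 falls back to the previous behavior: the repository root is used as the manifest directory and the ecosystem is detected from whichever known manifest file is found first.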