2 changes: 1 addition & 1 deletion .tekton/on-pull-request.yaml
@@ -231,7 +231,7 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
storage: 10Gi

- name: unit-test-cache
persistentVolumeClaim:
2 changes: 1 addition & 1 deletion .tekton/on-push.yaml
@@ -142,7 +142,7 @@ spec:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
storage: 10Gi
# This workspace injects the secret that allows the git-clone task to
# check out private repositories
- name: basic-auth
9 changes: 6 additions & 3 deletions kustomize/README.md
@@ -115,13 +115,16 @@ EOF

5. Create the `oauth-secrets.env` file containing the `client-secret` and `openshift-domain` values required by the [ExploitIQ Client](./base/exploit_iq_client.yaml) configuration.

Replace `some-long-secret-used-by-the-oauth-client` with a more secure, unique secret
If an OpenShift resource of kind `OAuthClient` named `exploit-iq-client` already exists, retrieve the secret from it:
```shell
export OAUTH_CLIENT_SECRET=$(oc get oauthclient exploit-iq-client -o jsonpath='{..secret}')
```
Otherwise, replace `some-long-secret-used-by-the-oauth-client` with a more secure, unique secret of your own:

```shell
export OAUTH_CLIENT_SECRET="some-long-secret-used-by-the-oauth-client"
```

```shell
cat > base/oauth-secrets.env << EOF
client-secret=$OAUTH_CLIENT_SECRET
@@ -168,7 +171,7 @@ redirectURIs:
- "http://$(oc get route exploit-iq-client -o jsonpath='{.spec.host}')/app/index.html"
EOF
```
Otherwise, just add your route to the existing `OAuthClient` CR object:
Otherwise (if creating the `OAuthClient` instance fails because it already exists in the cluster), just add your route to the existing `OAuthClient` CR object:
```shell
export HTTPS_ROUTE=https://$(oc get route exploit-iq-client -o jsonpath='{.spec.host}')/app/index.html
export HTTP_ROUTE=http://$(oc get route exploit-iq-client -o jsonpath='{.spec.host}')/app/index.html
20 changes: 14 additions & 6 deletions src/vuln_analysis/functions/cve_agent.py
@@ -69,20 +69,28 @@ async def _create_agent(config: CVEAgentExecutorToolConfig, builder: Builder,
tools = builder.get_tools(tool_names=config.tool_names, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
llm = await builder.get_llm(llm_name=config.llm_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
prompt = PromptTemplate.from_template(get_agent_prompt(config.prompt, config.prompt_examples))

ecosystem = state.original_input.input.image.ecosystem
transitive_search_tool_supports_ecosystem: bool = True
# Probe whether the transitive code search tool supports the image's ecosystem
if ecosystem:
try:
get_language_function_parser(ecosystem)
except NotImplementedError:
transitive_search_tool_supports_ecosystem = False
logger.warning(f"Transitive code search tool doesn't support programming language {ecosystem},"
f" disabling tool...")
# Filter tools that are not available
tools = [
tool for tool in tools
if not ((tool.name == "Container Image Code QA System" and state.code_vdb_path is None) or
(tool.name == "Container Image Developer Guide QA System" and state.doc_vdb_path is None) or
(tool.name == "Lexical Search Container Image Code QA System" and state.code_index_path is None) or
(tool.name == "Transitive code search tool" and (not config.transitive_search_tool_enabled or
state.code_index_path is None)) or
state.code_index_path is None or
not transitive_search_tool_supports_ecosystem)) or
(tool.name == "Calling Function Name Extractor" and (not config.transitive_search_tool_enabled or
state.code_index_path is None)) or
(tool.name == "Package and Function Locator" and (not config.transitive_search_tool_enabled or
state.code_index_path is None))

state.code_index_path is None or
not transitive_search_tool_supports_ecosystem))
)
]

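The hunk above pairs a capability probe with list filtering: the factory's raise (see the parser-factory change at the end of this diff) is caught once, and every ecosystem-dependent tool is dropped in one pass. A minimal runnable sketch of that pattern, with a stubbed factory and illustrative tool handling — the real filter also checks `config.transitive_search_tool_enabled` and `state.code_index_path`, which this sketch omits:

```python
import logging
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)

class Ecosystem(Enum):  # stand-in for vuln_analysis.utils.dep_tree.Ecosystem
    GO = "go"
    PYTHON = "python"
    RUST = "rust"  # assumed unsupported, for illustration only

def get_language_function_parser(ecosystem: Ecosystem):
    # Stubbed factory: mirrors the real one, which now raises for
    # ecosystems without a dedicated parser.
    if ecosystem in (Ecosystem.GO, Ecosystem.PYTHON):
        return object()
    raise NotImplementedError(f"No parser for {ecosystem}")

@dataclass
class Tool:
    name: str

# Tool names taken from the diff; grouping them in one set is this sketch's choice.
ECOSYSTEM_DEPENDENT = {"Transitive code search tool",
                       "Calling Function Name Extractor",
                       "Package and Function Locator"}

def filter_tools(tools: list[Tool], ecosystem: Ecosystem | None) -> list[Tool]:
    supported = True
    if ecosystem:
        try:
            get_language_function_parser(ecosystem)
        except NotImplementedError:
            supported = False
            logger.warning("Transitive code search doesn't support %s, disabling tools", ecosystem)
    return [t for t in tools if supported or t.name not in ECOSYSTEM_DEPENDENT]

tools = [Tool("Container Image Code QA System"), Tool("Transitive code search tool")]
assert [t.name for t in filter_tools(tools, Ecosystem.RUST)] == ["Container Image Code QA System"]
assert len(filter_tools(tools, Ecosystem.GO)) == 2
```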
24 changes: 15 additions & 9 deletions src/vuln_analysis/functions/cve_generate_vdbs.py
@@ -28,6 +28,7 @@

from vuln_analysis.data_models.common import AnalysisType
from vuln_analysis.logging.loggers_factory import LoggingFactory, trace_id
from vuln_analysis.utils.dep_tree import Ecosystem

logger = LoggingFactory.get_agent_logger(__name__)

@@ -70,7 +71,7 @@ async def generate_vdb(config: CVEGenerateVDBsToolConfig, builder: Builder):
from vuln_analysis.utils.source_rpm_downloader import RPMDependencyManager
from vuln_analysis.data_models.input import ManualSBOMInfoInput
from vuln_analysis.utils.standard_library_cache import StandardLibraryCache

agent_config = builder.get_function_config(config.agent_name)
assert isinstance(agent_config, CVEAgentExecutorToolConfig)

@@ -85,8 +86,8 @@ async def generate_vdb(config: CVEGenerateVDBsToolConfig, builder: Builder):
config.ignore_code_index = True

embedding = await builder.get_embedder(embedder_name=config.embedder_name, wrapper_type=LLMFrameworkEnum.LANGCHAIN)
# Configure RPM singleton with cache directory from config

# Configure RPM singleton with cache directory from config
rpm_manager = RPMDependencyManager.get_instance()
rpm_manager.set_rpm_cache_dir(config.base_rpm_dir)
cache_std = StandardLibraryCache.get_instance()
@@ -98,7 +99,7 @@ async def generate_vdb(config: CVEGenerateVDBsToolConfig, builder: Builder):
pickle_cache_directory=config.base_pickle_dir)

def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: DocumentEmbedding,
output_path: Path) -> bool :
output_path: Path) -> bool:
logger.info("Collecting documents from git repos. Source Infos: %s",
json.dumps([x.model_dump(mode="json") for x in source_infos]))

@@ -135,7 +136,8 @@ def _create_code_index(source_infos: list[SourceDocumentsInfo], embedder: Docume
logger.info("Completed code indexing in %.2f seconds for '%s'", time.time() - indexing_start_time, output_path)
return True

def _build_code_index(source_infos: list[SourceDocumentsInfo]) -> Path | None:
def _build_code_index(source_infos: list[SourceDocumentsInfo], ecosystem: Ecosystem | None = None,
manifest_path: str | None = None) -> Path | None:
code_index_path: Path | None = None

# Filter to only code sources
@@ -147,7 +149,9 @@ def _build_code_index(source_infos: list[SourceDocumentsInfo]) -> Path | None:
embedder = DocumentEmbedding(embedding=None,
vdb_directory=config.base_vdb_dir,
git_directory=config.base_git_dir,
pickle_cache_directory=config.base_pickle_dir)
pickle_cache_directory=config.base_pickle_dir,
manifest_path=manifest_path,
ecosystem=ecosystem)

# Determine code index path for either loading from cache or creating new index
# Need to add support for configurable base path
@@ -188,6 +192,8 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput:
base_image = message.image.name
source_infos = message.image.source_info
sbom_infos = message.image.sbom_info
ecosystem = message.image.ecosystem
manifest_path = message.image.manifest_path

try:
trace_id.set(message.scan.id)
@@ -208,13 +214,13 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput:

# Build code index if not ignored
if not config.ignore_code_index:
logger.info("analysis type: %s", message.image.analysis_type)
logger.info("analysis type: %s", message.image.analysis_type)
if message.image.analysis_type == AnalysisType.IMAGE and isinstance(sbom_infos, ManualSBOMInfoInput):
RPMDependencyManager.get_instance().sbom = sbom_infos.packages
image = f"{message.image.name}:{message.image.tag}"
RPMDependencyManager.get_instance().container_image = image
code_index_path = _build_code_index(source_infos)

code_index_path = _build_code_index(source_infos, ecosystem, manifest_path)

if code_index_path is None:
logger.warning(("Failed to generate code index for image '%s'. "
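Net effect of this file's changes: `ecosystem` and `manifest_path` travel from the input message through `_build_code_index` into `DocumentEmbedding`, with `None` defaults so older inputs and call sites keep working. A simplified, runnable sketch of that threading — the stand-in dataclasses carry only the fields relevant here, and values like `ubi9` and `src/requirements.txt` are illustrative:

```python
from dataclasses import dataclass

@dataclass
class Image:  # stand-in for the relevant fields of message.image
    name: str
    ecosystem: str | None = None
    manifest_path: str | None = None

@dataclass
class DocumentEmbeddingStub:  # stand-in for utils/document_embedding.DocumentEmbedding
    ecosystem: str | None = None
    manifest_path: str | None = None

def build_code_index(image: Image) -> DocumentEmbeddingStub:
    # Mirrors _build_code_index: both new values are forwarded verbatim;
    # None is legal and resolved later (see get_call_of_chains_retriever).
    return DocumentEmbeddingStub(ecosystem=image.ecosystem,
                                 manifest_path=image.manifest_path)

# Pre-change style input still works, both fields default to None:
assert build_code_index(Image("ubi9")).ecosystem is None
# New-style input carries both values all the way to the embedder:
embedder = build_code_index(Image("ubi9", ecosystem="PYTHON", manifest_path="src/requirements.txt"))
assert embedder.manifest_path == "src/requirements.txt"
```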
15 changes: 10 additions & 5 deletions src/vuln_analysis/tools/tests/test_transitive_code_search.py
@@ -130,6 +130,7 @@ async def get_transitive_code_runner_function():
async for function in transitive_code_search.gen:
return function.single_fn


python_dependency_tree_mock_output = (
'deptree==0.0.12 # deptree\n'
' importlib-metadata==8.7.0 # importlib-metadata\n'
@@ -147,6 +148,7 @@ async def get_transitive_code_runner_function():
' mock-package==1.1.1'
)


def mock_file_open(*args, **kwargs):
file_path = args[0] if args else kwargs.get('file', '')
mock_file = MagicMock()
@@ -179,14 +181,16 @@ def mock_file_open(*args, **kwargs):
"search_query": "werkzeug,formparser.MultiPartParser.parse",
"expected_path_found": False,
"expected_list_length": 0,
"mock_documents": [python_script_example, python_init_function_example, python_full_document_example, python_parse_function_example]
"mock_documents": [python_script_example, python_init_function_example, python_full_document_example,
python_parse_function_example]
},
{
"name": "python_3",
"search_query": "mock_package,mock_function_in_use",
"expected_path_found": True,
"expected_list_length": 3,
"mock_documents": [python_script_example, python_init_function_example, python_full_document_example, python_parse_function_example, python_mock_function_in_use, python_mock_file]
"mock_documents": [python_script_example, python_init_function_example, python_full_document_example,
python_parse_function_example, python_mock_function_in_use, python_mock_file]
}
])
@patch('vuln_analysis.utils.dep_tree.run_command', return_value=python_dependency_tree_mock_output)
Expand All @@ -204,7 +208,7 @@ async def test_transitive_search_python_parameterized(mock_open, mock_run_comman
)

with patch('vuln_analysis.utils.document_embedding.retrieve_from_cache',
return_value=(test_case["mock_documents"], True)):
return_value=(test_case["mock_documents"], True)):
result = await transitive_code_search_runner_coroutine(test_case["search_query"])

(path_found, list_path) = result
@@ -215,6 +219,7 @@ async def test_transitive_search_python_parameterized(mock_open, mock_run_comman
assert path_found == test_case["expected_path_found"]
assert len(list_path) == test_case["expected_list_length"]


@pytest.mark.asyncio
async def test_python_transitive_search():
"""Test that runs with a real repository"""
@@ -234,7 +239,7 @@ async def test_python_transitive_search():
print(f"DEBUG: path_found = {path_found}")
print(f"DEBUG: list_path = {list_path}")
print(f"DEBUG: len(list_path) = {len(list_path)}")
assert path_found == True
assert path_found is True
assert len(list_path) == 2


@@ -299,4 +304,4 @@ async def test_c_transitive_search_2():
print(f"DEBUG: list_path = {list_path}")
print(f"DEBUG: len(list_path) = {len(list_path)}")
assert len(list_path) == 1
assert path_found == False
assert path_found is False
31 changes: 20 additions & 11 deletions src/vuln_analysis/tools/transitive_code_search.py
@@ -33,6 +33,7 @@
from ..utils.function_name_locator import FunctionNameLocator

from vuln_analysis.logging.loggers_factory import LoggingFactory

logger = LoggingFactory.get_agent_logger(__name__)


@@ -54,19 +55,27 @@ class PackageAndFunctionLocatorToolConfig(FunctionBaseConfig, name="package_and_
"""


def get_call_of_chains_retriever(documents_embedder, si):
def get_call_of_chains_retriever(documents_embedder, si, ecosystem, manifest_path: str):
documents: list[Document]
git_repo = None
for source_info in si:
if source_info.type == "code":
git_repo = documents_embedder.get_repo_path(source_info)
documents = documents_embedder.collect_documents(source_info)

if git_repo is None:
raise ValueError("No code source info found")
with open(os.path.join(git_repo, 'ecosystem_data.txt'), 'r', encoding='utf-8') as file:
ecosystem = file.read()
if not ecosystem:
with open(os.path.join(git_repo, 'ecosystem_data.txt'), 'r', encoding='utf-8') as file:
ecosystem = file.read()
ecosystem = Ecosystem[ecosystem]
coc_retriever = ChainOfCallsRetriever(documents=documents, ecosystem=ecosystem, manifest_path=git_repo)
if manifest_path:
git_repo_with_manifest = git_repo.joinpath(manifest_path)
else:
git_repo_with_manifest = git_repo
coc_retriever = ChainOfCallsRetriever(documents=documents, ecosystem=ecosystem,
manifest_path=git_repo_with_manifest)
return coc_retriever


@@ -75,7 +84,8 @@ def get_transitive_code_searcher():
if state.transitive_code_searcher is None:
si = state.original_input.input.image.source_info
documents_embedder = DocumentEmbedding(embedding=None)
coc_retriever = get_call_of_chains_retriever(documents_embedder, si)
coc_retriever = get_call_of_chains_retriever(documents_embedder, si, state.original_input.input.image.ecosystem,
state.original_input.input.image.manifest_path)
transitive_code_searcher = TransitiveCodeSearcher(chain_of_calls_retriever=coc_retriever)
state.transitive_code_searcher = transitive_code_searcher
return state.transitive_code_searcher
@@ -172,7 +182,7 @@ async def _arun(query: str) -> list:

@register_function(config_type=PackageAndFunctionLocatorToolConfig, framework_wrappers=[LLMFrameworkEnum.LANGCHAIN])
async def package_and_function_locator(config: PackageAndFunctionLocatorToolConfig,
builder: Builder): # pylint: disable=unused-argument
builder: Builder): # pylint: disable=unused-argument

async def _arun(query: str) -> dict:
coc_retriever: ChainOfCallsRetriever
Expand All @@ -182,13 +192,12 @@ async def _arun(query: str) -> dict:
locator = FunctionNameLocator(coc_retriever)
result = await locator.locate_functions(query)
pkg_msg = "Package is valid."
if not locator.is_package_valid and not locator.is_std_package:
pkg_msg = "Package is not valid."


if not locator.is_package_valid and not locator.is_std_package:
pkg_msg = "Package is not valid."

return {
"ecosystem": coc_retriever.ecosystem.name,
"package_msg": pkg_msg,
"package_msg": pkg_msg,
"result": result
}

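The rewritten `get_call_of_chains_retriever` has two fallbacks worth calling out: when no ecosystem is passed in, it falls back to the `ecosystem_data.txt` marker file inside the cloned repo, and a manifest path, when present, is resolved relative to the repo root. A standalone sketch of just that resolution logic, assuming the marker file stores the `Ecosystem` member name (the `.strip()` is this sketch's own defensive touch):

```python
from enum import Enum
from pathlib import Path

class Ecosystem(Enum):  # stand-in for vuln_analysis.utils.dep_tree.Ecosystem
    GO = "go"
    PYTHON = "python"

def resolve_ecosystem_and_manifest(git_repo: Path,
                                   ecosystem: str | None,
                                   manifest_path: str | None) -> tuple[Ecosystem, Path]:
    # Fallback: older pipelines didn't pass an ecosystem, so it was written
    # to a marker file inside the cloned repository at indexing time.
    if not ecosystem:
        ecosystem = (git_repo / "ecosystem_data.txt").read_text(encoding="utf-8")
    # Without a manifest path, the retriever gets the repo root itself.
    manifest = git_repo / manifest_path if manifest_path else git_repo
    return Ecosystem[ecosystem.strip()], manifest
```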
10 changes: 8 additions & 2 deletions src/vuln_analysis/utils/document_embedding.py
@@ -37,6 +37,7 @@
from langchain_core.document_loaders.blob_loaders import Blob

from vuln_analysis.data_models.input import SourceDocumentsInfo
from vuln_analysis.utils.dep_tree import Ecosystem
from vuln_analysis.utils.go_segmenters_with_methods import GoSegmenterWithMethods
from vuln_analysis.utils.python_segmenters_with_classes_methods import PythonSegmenterWithClassesMethods
from vuln_analysis.utils.js_extended_parser import ExtendedJavaScriptSegmenter
@@ -258,7 +259,8 @@ class DocumentEmbedding:

def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = "./.cache/am_cache/vdb",
git_directory: PathLike = "./.cache/am_cache/git", chunk_size: int = 800, chunk_overlap: int = 160,
pickle_cache_directory: PathLike = "./.cache/am_cache/pickle"):
pickle_cache_directory: PathLike = "./.cache/am_cache/pickle", ecosystem: Ecosystem | None = None,
manifest_path: str | None = None):
"""
Create a new DocumentEmbedding instance.

@@ -284,6 +286,8 @@ def __init__(self, *, embedding: "Embeddings", vdb_directory: PathLike = "./.cac
self._chunk_size = chunk_size
self._chunk_overlap = chunk_overlap
self._pickle_cache_directory = Path(pickle_cache_directory)
self._ecosystem = ecosystem
self._manifest_path = manifest_path

@property
def embedding(self):
@@ -397,7 +401,9 @@ def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
clone_url=source_info.git_repo,
ref=source_info.ref,
include=source_info.include,
exclude=source_info.exclude)
exclude=source_info.exclude,
manifest_path=self._manifest_path,
ecosystem=self._ecosystem)
blob_parser = ExtendedLanguageParser()

loader = GenericLoader(blob_loader=blob_loader, blob_parser=blob_parser)
@@ -327,8 +327,14 @@ def parse_all_type_struct_class_to_fields(self, types: list[Document]) -> dict[t
next_struct = current_line_stripped.find("struct")
if next_eol > - 1 and (next_struct == -1 or next_struct > next_eol):
if not self.is_comment_line(current_line_stripped[:next_eol + 1]):

declaration_parts = current_line_stripped[:next_eol + 1].split()
# If a row inside the block contains "func", it declares a function type and
# needs special parsing.
if "func" in current_line_stripped[:next_eol + 1]:
declaration_parts = current_line_stripped[:next_eol + 1].split("func")
declaration_parts = [part.strip() for part in declaration_parts]
if len(declaration_parts) == 2:
declaration_parts[1] = f"func {declaration_parts[1]}"
# ignore alias' "equals" notation
if len(declaration_parts) == 3:
[name, _, type_name] = declaration_parts
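The new branch exists because a function-typed declaration in a Go `type (...)` block, e.g. `HandlerFunc func(w http.ResponseWriter, r *http.Request)`, has spaces inside its type, so a plain whitespace split no longer yields a `[name, type]` pair. A small sketch of the splitting rule, with illustrative Go lines as input strings; like the real code, it assumes the lowercase keyword appears only in the type position, and note the re-prefix normalizes to `func ` with a space:

```python
def split_declaration(line: str) -> list[str]:
    # Mirrors the parser change: function types are split on the "func"
    # keyword and the type is re-prefixed; everything else splits on whitespace.
    if "func" in line:
        parts = [p.strip() for p in line.split("func")]
        if len(parts) == 2:
            parts[1] = f"func {parts[1]}"
        return parts
    return line.split()

# Plain field: whitespace split yields a clean [name, type] pair.
assert split_declaration("Timeout int") == ["Timeout", "int"]
# Function type: a whitespace split would shatter the signature.
assert split_declaration("HandlerFunc func(w http.ResponseWriter, r *http.Request)") == \
    ["HandlerFunc", "func (w http.ResponseWriter, r *http.Request)"]
```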
@@ -11,8 +11,7 @@ def get_language_function_parser(ecosystem: Ecosystem, tree: DependencyTree | No
:param ecosystem: the desired programming language parser.
:param tree: the dependency tree for the ecosystem, can be None.
:return:
The right language functions parser associated to the ecosystem, if not exists, return ABC parent that
doesn't do anything
The language functions parser associated with the ecosystem; if none exists, raise NotImplementedError
"""
if ecosystem == Ecosystem.GO:
return GoLanguageFunctionsParser()
@@ -24,4 +23,4 @@ def get_language_function_parser(ecosystem: Ecosystem, tree: DependencyTree | No
parser_obj.init_CParser(tree)
return parser_obj
else:
return LanguageFunctionsParser()
raise NotImplementedError(f"Language functions parser for {ecosystem} not implemented.")
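Design note on this change: the old null-object return (`LanguageFunctionsParser()`) let unsupported ecosystems fail silently by finding nothing, whereas raising makes the gap explicit and turns the factory into a capability probe — which is exactly how the cve_agent.py hunk above consumes it. A minimal sketch of that caller-side pattern against a stubbed factory:

```python
from enum import Enum

class Ecosystem(Enum):  # stand-in enum; the real one lives in utils/dep_tree.py
    GO = "go"
    RUST = "rust"  # assumed unsupported, for illustration only

def get_language_function_parser(ecosystem: Ecosystem):
    if ecosystem is Ecosystem.GO:
        return object()  # stand-in for GoLanguageFunctionsParser()
    raise NotImplementedError(f"Language functions parser for {ecosystem} not implemented.")

def ecosystem_is_supported(ecosystem: Ecosystem) -> bool:
    # EAFP probe: the raise is the supported/unsupported signal, so callers
    # can no longer silently receive a parser that does nothing.
    try:
        get_language_function_parser(ecosystem)
        return True
    except NotImplementedError:
        return False

assert ecosystem_is_supported(Ecosystem.GO)
assert not ecosystem_is_supported(Ecosystem.RUST)
```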