Skip to content

Commit cf64dbb

Browse files
committed
fix: correct repo path handling and directory traversal
Signed-off-by: Vladimir Belousov <[email protected]>
1 parent cf9c699 commit cf64dbb

File tree

4 files changed

+31
-18
lines changed

4 files changed

+31
-18
lines changed

src/exploit_iq_commons/embedding/document_embedding.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
from exploit_iq_commons.embedding.go_segmenters_with_methods import GoSegmenterWithMethods
4141
from exploit_iq_commons.embedding.js_extended_parser import ExtendedJavaScriptSegmenter
4242
from exploit_iq_commons.embedding.source_code_git_loader import SourceCodeGitLoader
43+
from vuln_analysis.utils.git_utils import sanitize_git_url_for_path
4344
from exploit_iq_commons.embedding.transitive_code_searcher_tool import TransitiveCodeSearcher
4445
from exploit_iq_commons.logging.loggers_factory import LoggingFactory
4546

@@ -348,7 +349,11 @@ def get_repo_path(self, source_info: SourceDocumentsInfo):
348349
Path
349350
Returns the path to the git repository.
350351
"""
351-
return self._git_directory / PurePath(source_info.git_repo)
352+
# Sanitize the git repo URL to create a valid filesystem path
353+
# Replace path separators with dots and drop colons so the URL becomes a single path component
354+
# Example: 'https://github.com/RHEcosystemAppEng/vulnerability-analysis' -> 'https.github.com.RHEcosystemAppEng.vulnerability-analysis'
355+
sanitized_repo_path = sanitize_git_url_for_path(source_info.git_repo)
356+
return self._git_directory / PurePath(sanitized_repo_path)
352357

353358
def collect_documents(self, source_info: SourceDocumentsInfo) -> list[Document]:
354359
"""

src/exploit_iq_commons/embedding/source_code_git_loader.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -179,18 +179,18 @@ def yield_blobs(self) -> typing.Iterator[Blob]:
179179
logger.info("Processing %d files in the Git repository at path: '%s'", len(final_files), self.repo_path)
180180

181181
for f in tqdm(final_files):
182-
183-
file_path = Path(f)
184-
185-
abs_file_path = base_path / file_path
186-
187-
rel_file_path = str(file_path)
188-
189-
metadata = {
190-
"source": rel_file_path,
191-
"file_path": rel_file_path,
192-
"file_name": file_path.name,
193-
"file_type": file_path.suffix,
194-
}
195-
196-
yield Blob.from_path(abs_file_path, metadata=metadata)
182+
abs_file_path = base_path / f
183+
if abs_file_path.is_file():
184+
try:
185+
rel_file_path = str(f)
186+
metadata = {
187+
"source": rel_file_path,
188+
"file_path": rel_file_path,
189+
"file_name": abs_file_path.name,
190+
"file_type": abs_file_path.suffix,
191+
}
192+
yield Blob.from_path(abs_file_path, metadata=metadata)
193+
except Exception as e:
194+
logger.warning("Failed to read blob for '%s'. Ignoring this file. Error: %s", abs_file_path, e)
195+
else:
196+
logger.debug("Skipping path as it is a directory, not a file: '%s'", abs_file_path)

src/vuln_analysis/functions/cve_generate_vdbs.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,10 +202,12 @@ async def _arun(message: AgentMorpheusInput) -> AgentMorpheusEngineInput:
202202
# Replace ref with specific commit hash for each source info
203203
for si in source_infos:
204204
try:
205-
repo = get_repo_from_path(config.base_git_dir, si.git_repo)
205+
# Get the sanitized path from the embedder instance
206+
repo_path = embedder.get_repo_path(si)
207+
repo = get_repo_from_path(str(repo_path.parent), repo_path.name)
206208
si.ref = repo.commit().hexsha
207209
except ValueError as e:
208-
logger.warning("Failed to get commit hash for %s/%s: %s", config.base_git_dir, si.git_repo, e)
210+
logger.warning("Failed to get commit hash for repo defined in %s: %s", si, e)
209211
continue
210212

211213
except Exception as e:

src/vuln_analysis/utils/git_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@
2121
from git import Repo
2222

2323

24+
def sanitize_git_url_for_path(git_url: str) -> str:
    """Sanitize a git repo URL into a single, safe filesystem path component.

    Forward slashes (a double slash collapses to one dot) and backslashes are
    replaced with ``.``; colons are dropped without a replacement. The result
    therefore contains no path separators on either POSIX or Windows.

    Example: 'https://github.com/some/repo' -> 'https.github.com.some.repo'

    Args:
        git_url: Repository URL (or path-like identifier) to sanitize.

    Returns:
        The sanitized name. A result that would be empty or consist only of
        dots (e.g. for ``''``, ``'/'`` or ``'..'``) is prefixed with ``_`` so
        that joining it onto a base directory can never resolve to that
        directory itself or to its parent.
    """
    sanitized = (
        git_url
        .replace('//', '.')
        .replace('/', '.')
        # Backslashes are path separators on Windows; neutralize them as well.
        .replace('\\', '.')
        .replace(':', '')
    )
    # '', '.' and '..' are not real directory names: appended to a base path
    # they alias the base directory or escape to its parent.
    if not sanitized.strip('.'):
        return '_' + sanitized
    return sanitized
28+
29+
2430
def get_repo_from_path(base_dir: str, git_repo: str = ".git") -> Repo:
2531
"""
2632
Utility function for getting GitPython `Repo` object representing a Git repository.

0 commit comments

Comments
 (0)