fix(downloads): Use random generated name for files (backport #540)#541
Conversation
|
| Filename | Overview |
|---|---|
| agent/utils.py | Replaces URL-derived filename with secrets.token_urlsafe(16) plus a hardcoded-known extension, eliminating path traversal via crafted URLs. |
| agent/site.py | Adds _safe_extract_tar to replace shell tar invocations; rejects symlinks, hardlinks, absolute paths, and parent-traversal. Minor: os.path.realpath(dest) is re-evaluated inside the loop. |
| agent/base.py | Deprecates string-form commands in run_subprocess (switches shell=True→False for list commands); raises TypeError when executable is passed with a list, which is a minor breaking change for any existing callers relying on that combination. |
Flowchart
%%{init: {'theme': 'neutral'}}%%
flowchart TD
A[download_file URL] --> B[Parse URL path basename]
B --> C{Known extension?}
C -- yes --> D[ext = matched suffix]
C -- no --> E[ext = empty string]
D --> F[filename = token_urlsafe + ext]
E --> F
F --> G[Save to prefix/filename]
G --> H[restore_files]
H --> I[_safe_extract_tar]
I --> J[tarfile.open path]
J --> K[For each member]
K --> L{issym or islnk?}
L -- yes --> M[raise ExtractError]
L -- no --> N{isabs name?}
N -- yes --> M
N -- no --> O{'.. in parts?}
O -- yes --> M
O -- no --> P{strip > 0?}
P -- yes --> Q[Strip N leading components]
Q --> R{empty after strip?}
R -- yes --> S[skip member]
R -- no --> T[Check target within dest_real]
P -- no --> T
T --> U{outside dest?}
U -- yes --> M
U -- no --> V[Add to valid list]
V --> W[tar.extractall valid members]
Prompt To Fix All With AI
Fix the following 1 code review issue. Work through them one at a time, proposing concise fixes.
---
### Issue 1 of 1
agent/site.py:147-168
`dest_real` is recomputed via `os.path.realpath` on every loop iteration. Since `dest` never changes within the loop, this should be moved above the loop to avoid repeated syscalls proportional to archive member count.
```suggestion
with tarfile.open(path) as tar:
members = tar.getmembers()
valid = []
dest_real = os.path.realpath(dest)
for member in members:
if member.issym() or member.islnk():
raise tarfile.ExtractError(f"Refusing to extract link: {member.name}")
if os.path.isabs(member.name):
raise tarfile.ExtractError(f"Refusing absolute path: {member.name}")
parts = member.name.split("/")
if ".." in parts:
raise tarfile.ExtractError(f"Refusing parent traversal: {member.name}")
if strip:
stripped = "/".join(parts[strip:])
if not stripped:
continue
member.name = stripped
target = os.path.realpath(os.path.join(dest, member.name))
if not target.startswith(dest_real + os.sep):
raise tarfile.ExtractError(f"Refusing path outside destination: {member.name}")
valid.append(member)
tar.extractall(path=dest, members=valid)
```
Reviews (1): Last reviewed commit: "fix(downloads): Only extract valid paths" | Re-trigger Greptile
| with tarfile.open(path) as tar: | ||
| members = tar.getmembers() | ||
| valid = [] | ||
| for member in members: | ||
| if member.issym() or member.islnk(): | ||
| raise tarfile.ExtractError(f"Refusing to extract link: {member.name}") | ||
| if os.path.isabs(member.name): | ||
| raise tarfile.ExtractError(f"Refusing absolute path: {member.name}") | ||
| parts = member.name.split("/") | ||
| if ".." in parts: | ||
| raise tarfile.ExtractError(f"Refusing parent traversal: {member.name}") | ||
| if strip: | ||
| stripped = "/".join(parts[strip:]) | ||
| if not stripped: | ||
| continue | ||
| member.name = stripped | ||
| dest_real = os.path.realpath(dest) | ||
| target = os.path.realpath(os.path.join(dest, member.name)) | ||
| if not target.startswith(dest_real + os.sep): | ||
| raise tarfile.ExtractError(f"Refusing path outside destination: {member.name}") | ||
| valid.append(member) | ||
| tar.extractall(path=dest, members=valid) |
There was a problem hiding this comment.
dest_real is recomputed via os.path.realpath on every loop iteration. Since dest never changes within the loop, this should be moved above the loop to avoid repeated syscalls proportional to archive member count.
| with tarfile.open(path) as tar: | |
| members = tar.getmembers() | |
| valid = [] | |
| for member in members: | |
| if member.issym() or member.islnk(): | |
| raise tarfile.ExtractError(f"Refusing to extract link: {member.name}") | |
| if os.path.isabs(member.name): | |
| raise tarfile.ExtractError(f"Refusing absolute path: {member.name}") | |
| parts = member.name.split("/") | |
| if ".." in parts: | |
| raise tarfile.ExtractError(f"Refusing parent traversal: {member.name}") | |
| if strip: | |
| stripped = "/".join(parts[strip:]) | |
| if not stripped: | |
| continue | |
| member.name = stripped | |
| dest_real = os.path.realpath(dest) | |
| target = os.path.realpath(os.path.join(dest, member.name)) | |
| if not target.startswith(dest_real + os.sep): | |
| raise tarfile.ExtractError(f"Refusing path outside destination: {member.name}") | |
| valid.append(member) | |
| tar.extractall(path=dest, members=valid) | |
| with tarfile.open(path) as tar: | |
| members = tar.getmembers() | |
| valid = [] | |
| dest_real = os.path.realpath(dest) | |
| for member in members: | |
| if member.issym() or member.islnk(): | |
| raise tarfile.ExtractError(f"Refusing to extract link: {member.name}") | |
| if os.path.isabs(member.name): | |
| raise tarfile.ExtractError(f"Refusing absolute path: {member.name}") | |
| parts = member.name.split("/") | |
| if ".." in parts: | |
| raise tarfile.ExtractError(f"Refusing parent traversal: {member.name}") | |
| if strip: | |
| stripped = "/".join(parts[strip:]) | |
| if not stripped: | |
| continue | |
| member.name = stripped | |
| target = os.path.realpath(os.path.join(dest, member.name)) | |
| if not target.startswith(dest_real + os.sep): | |
| raise tarfile.ExtractError(f"Refusing path outside destination: {member.name}") | |
| valid.append(member) | |
| tar.extractall(path=dest, members=valid) |
Prompt To Fix With AI
This is a comment left during a code review.
Path: agent/site.py
Line: 147-168
Comment:
`dest_real` is recomputed via `os.path.realpath` on every loop iteration. Since `dest` never changes within the loop, this should be moved above the loop to avoid repeated syscalls proportional to archive member count.
```suggestion
with tarfile.open(path) as tar:
members = tar.getmembers()
valid = []
dest_real = os.path.realpath(dest)
for member in members:
if member.issym() or member.islnk():
raise tarfile.ExtractError(f"Refusing to extract link: {member.name}")
if os.path.isabs(member.name):
raise tarfile.ExtractError(f"Refusing absolute path: {member.name}")
parts = member.name.split("/")
if ".." in parts:
raise tarfile.ExtractError(f"Refusing parent traversal: {member.name}")
if strip:
stripped = "/".join(parts[strip:])
if not stripped:
continue
member.name = stripped
target = os.path.realpath(os.path.join(dest, member.name))
if not target.startswith(dest_real + os.sep):
raise tarfile.ExtractError(f"Refusing path outside destination: {member.name}")
valid.append(member)
tar.extractall(path=dest, members=valid)
```
How can I resolve this? If you propose a fix, please make it concise.Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!
This is an automatic backport of pull request #540 done by [Mergify](https://mergify.com).