diff --git a/configs/sandbox-server/code_config.json b/configs/sandbox-server/code_config.json new file mode 100644 index 0000000..22bd72a --- /dev/null +++ b/configs/sandbox-server/code_config.json @@ -0,0 +1,21 @@ +{ + "server": { + "url": "http://127.0.0.1:18890", + "port": 18890, + "session_ttl": 300 + }, + "resources": { + "code": { + "enabled": true, + "description": "Lightweight coding backend powered by vendored internal tools", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "workspace_root": "/tmp/agentflow_code" + } + } + }, + "warmup": { + "enabled": false, + "resources": [] + } +} diff --git a/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md b/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md new file mode 100644 index 0000000..e282aed --- /dev/null +++ b/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md @@ -0,0 +1,727 @@ +# Code Backend Single-Repository Vendoring Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make AgentFlow's `code` backend self-contained by vendoring the six upstream-style code tools into this repository, removing `claude_code_root`/`allow_bash`/`bash_timeout_seconds`, and adding an opt-in real rollout smoke. + +**Architecture:** Keep `code` as a session-scoped sandbox backend with AgentFlow-owned workspace lifecycle and path-boundary enforcement. Replace all external-root loading with a small internal `code_vendor` package, route all six tools through the same vendored `tool.call(...)` path, and verify the result with updated sandbox tests plus an MCP-style env-gated real rollout smoke. 
+ +**Tech Stack:** Python 3.10, pytest, FastAPI sandbox server, pathlib/shutil, vendored upstream-style tool classes, RolloutPipeline, real LLM smoke via env-gated pytest collection + pytest CLI options + +--- + +**Known baseline:** In this worktree, `PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py sandbox/tests/test_code_tool_schemas.py sandbox/tests/test_sandbox_config_loading.py rollout/tests/test_config.py rollout/tests/test_integration.py` passes (`68 passed, 2 skipped`). `pip install -r requirements.txt` still hits the existing unrelated `pyxcursor` dependency-resolution issue from the VM stack; do not block this plan on that package. + +## File Map + +### New files + +- `sandbox/server/backends/resources/code_vendor/__init__.py` + Internal package export surface for the vendored six-tool compatibility layer. +- `sandbox/server/backends/resources/code_vendor/tool.py` + Minimal vendored `Tool` base class used by the six code tools. +- `sandbox/server/backends/resources/code_vendor/file_tools.py` + Vendored upstream-style `ReadTool`, `GlobTool`, `GrepTool`, and `BashTool`. +- `sandbox/server/backends/resources/code_vendor/edit_tools.py` + Vendored upstream-style `EditTool` and `WriteTool`. +- `sandbox/tests/test_code_vendor_tools.py` + Focused behavior-contract tests for the vendored tool package independent of `CodeBackend`. +- `rollout/tests/conftest.py` + Mirror the MCP real-smoke collection-gating pattern for `code` rollout tests and add pytest CLI options for real-smoke credentials. +- `rollout/tests/test_code_real_smoke.py` + Opt-in real rollout smoke that starts sandbox, uses the real LLM path, and proves at least one real `code:*` call happens against a temporary fixture repo. + +### Modified files + +- `sandbox/server/backends/resources/code.py` + Remove external-root logic and `bash` special casing; load vendored tools directly and execute all six through the same code path. 
+- `configs/sandbox-server/code_config.json` + Remove deleted config fields and present the `code` backend as a native AgentFlow capability with only `workspace_root`. +- `sandbox/tests/test_code_backend.py` + Delete obsolete external-root/bash-wrapper tests, keep valid workspace/boundary coverage, and rewrite tool-loading expectations around internal vendoring. +- `sandbox/tool_schemas/code_tools.py` + Update `code-bash` description to remove backend-config-dependent availability wording. +- `sandbox/tests/test_code_tool_schemas.py` + Update schema assertions to match the new `code-bash` description and keep the rest of the parameter contract coverage. +- `sandbox/tests/test_sandbox_config_loading.py` + Replace the `CLAUDE_CODE_ROOT` env-expansion test with a config-loading assertion for the simplified `code` backend template. + +### Intentionally unchanged files + +- `rollout/core/config.py` +- `rollout/core/runner.py` +- `rollout/pipeline.py` +- `sandbox/tool_schemas/__init__.py` +- `sandbox/server/backends/resources/__init__.py` + +The rollout and backend registration plumbing already supports the target design. Do not widen scope into rollout engine rewrites or unrelated backend refactors. 
+ +## Chunk 1: Vendor the Upstream-Style Tool Subset + +### Task 1: Add the internal `code_vendor` package and behavior-contract tests + +**Files:** +- Create: `sandbox/server/backends/resources/code_vendor/__init__.py` +- Create: `sandbox/server/backends/resources/code_vendor/tool.py` +- Create: `sandbox/server/backends/resources/code_vendor/file_tools.py` +- Create: `sandbox/server/backends/resources/code_vendor/edit_tools.py` +- Create: `sandbox/tests/test_code_vendor_tools.py` + +- [ ] **Step 1: Write the failing vendored-tool tests** + +Create `sandbox/tests/test_code_vendor_tools.py` with focused tests like: + +```python +import asyncio +from pathlib import Path +from types import SimpleNamespace + +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool + + +def test_read_tool_returns_line_numbered_content(tmp_path): + target = tmp_path / "demo.py" + target.write_text("first\nsecond\n", encoding="utf-8") + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run(ReadTool().call({"file_path": str(target)}, ctx)) + + assert "1" in result + assert "first" in result + assert "second" in result + + +def test_edit_tool_requires_unique_match(tmp_path): + target = tmp_path / "demo.py" + target.write_text("x = 1\nx = 1\n", encoding="utf-8") + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run( + EditTool().call( + { + "file_path": str(target), + "old_string": "x = 1", + "new_string": "x = 2", + }, + ctx, + ) + ) + + assert result.startswith("Error:") + assert "appears" in result + + +def test_bash_tool_combines_stdout_and_stderr(tmp_path): + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run( + BashTool().call( + { + "command": "python -c \"import sys; print('out'); print('err', file=sys.stderr)\"" + }, + ctx, + ) + ) + + assert result == "out\n\n[stderr]:\nerr" +``` + +- [ ] 
**Step 2: Run the vendored-tool tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py +``` + +Expected: FAIL because the `code_vendor` package does not exist yet. + +- [ ] **Step 3: Add the vendored tool package** + +Create `sandbox/server/backends/resources/code_vendor/tool.py` with a minimal base: + +```python +from __future__ import annotations + +from abc import ABC, abstractmethod + + +class Tool(ABC): + name: str + description: str + + @property + @abstractmethod + def input_schema(self) -> dict: + ... + + @abstractmethod + async def call(self, args: dict, ctx) -> str: + ... + + def is_read_only(self, args: dict) -> bool: + return False +``` + +Create `sandbox/server/backends/resources/code_vendor/file_tools.py` with vendored upstream-style implementations: + +```python +from __future__ import annotations + +import subprocess +from pathlib import Path + +from .tool import Tool + + +class BashTool(Tool): + name = "Bash" + description = "Execute a shell command and return stdout/stderr." + + @property + def input_schema(self) -> dict: + return { + "type": "object", + "properties": {"command": {"type": "string", "description": "Shell command to run"}}, + "required": ["command"], + } + + async def call(self, args: dict, ctx) -> str: + result = subprocess.run( + args["command"], + shell=True, + capture_output=True, + text=True, + cwd=ctx.cwd, + ) + out = result.stdout + if result.stderr: + out += f"\n[stderr]:\n{result.stderr}" + return out.strip() or "(no output)" + + def is_read_only(self, args: dict) -> bool: + return False +``` + +Add matching vendored implementations for `ReadTool`, `GlobTool`, `GrepTool`, `EditTool`, and `WriteTool`, preserving the current upstream-style semantics already described in the approved spec. Keep imports package-local only; do not carry over `log.py`, `trace.py`, or a vendored tool executor. 
+ +Create `sandbox/server/backends/resources/code_vendor/__init__.py` to export the six tool classes. + +- [ ] **Step 4: Run the vendored-tool tests** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py +``` + +Expected: PASS. The vendored tool package exists and captures the expected upstream-style behavior. + +- [ ] **Step 5: Commit the vendored tool package** + +```bash +git add sandbox/server/backends/resources/code_vendor/__init__.py \ + sandbox/server/backends/resources/code_vendor/tool.py \ + sandbox/server/backends/resources/code_vendor/file_tools.py \ + sandbox/server/backends/resources/code_vendor/edit_tools.py \ + sandbox/tests/test_code_vendor_tools.py +git commit -m "feat: vendor code backend tool subset" +``` + +## Chunk 2: Simplify `CodeBackend` to Use Vendored Tools Only + +### Task 2: Rewrite `CodeBackend` around the internal tool package + +**Files:** +- Modify: `sandbox/server/backends/resources/code.py` +- Modify: `sandbox/tests/test_code_backend.py` +- Modify: `configs/sandbox-server/code_config.json` + +- [ ] **Step 1: Rewrite the backend tests around the new design** + +In `sandbox/tests/test_code_backend.py`: + +- delete external-root helper factories such as `create_fake_claude_code_root()` and `create_marker_claude_code_root()` +- delete all tests whose only purpose is external-root loading, root-local support modules, or per-root loader isolation +- delete all `allow_bash` and `bash_timeout_seconds` tests +- add/keep failing tests like: + +```python +def build_backend_config(tmp_path): + return BackendConfig( + enabled=True, + default_config={ + "workspace_root": str(tmp_path / "agentflow_code"), + }, + description="Code backend", + ) + + +def test_initialize_does_not_require_external_root(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + + session = asyncio.run(backend.initialize("runner_123", {})) + + assert 
Path(session["workspace"]).exists() + + +def test_load_code_tools_uses_internal_vendor_package(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + + tools = backend._load_code_tools() + + assert set(tools) == {"read", "glob", "grep", "bash", "edit", "write"} + assert tools["bash"].__class__.__module__.endswith("code_vendor.file_tools") + + +def test_tool_executor_runs_bash_via_vendored_tool(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + fake_server = FakeServer() + backend.bind_server(fake_server) + workspace = tmp_path / "agentflow_code" / "worker-1" + workspace.mkdir(parents=True) + + executor = ToolExecutor( + tools=fake_server._tools, + tool_name_index={}, + tool_resource_types=fake_server._tool_resource_types, + resource_router=FakeResourceRouter( + {"session_id": "sid", "data": {"workspace": str(workspace)}} + ), + ) + + result = asyncio.run( + executor.execute( + action="code:bash", + params={"command": "pwd"}, + worker_id="worker-1", + trace_id="trace-1", + ) + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"].strip() == str(workspace.resolve(strict=False)) +``` + +- [ ] **Step 2: Run the rewritten backend tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py -k "does_not_require_external_root or internal_vendor_package or runs_bash_via_vendored_tool" +``` + +Expected: FAIL because `CodeBackend` still depends on `claude_code_root` and still special-cases `bash`. 
+ +- [ ] **Step 3: Rewrite `sandbox/server/backends/resources/code.py`** + +Update `CodeBackend` to: + +- keep only `workspace_root` in its default config +- rename the internal tool loader to something neutral like `_load_code_tools()` +- import vendored classes directly, for example: + +```python +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool +``` + +- cache vendored instances per backend instance: + +```python +self._tool_instances = { + "read": ReadTool(), + "glob": GlobTool(), + "grep": GrepTool(), + "bash": BashTool(), + "edit": EditTool(), + "write": WriteTool(), +} +``` + +- remove these methods entirely: + - `_get_claude_code_root()` + - `_validate_claude_code_root_prerequisites()` + - `_load_root_support_modules()` + - `_temporary_module_aliases()` + - `_load_module_from_path()` + - `_run_bash_command()` + +- remove any `tool_name == "bash"` branch in `_dispatch()` +- after session/workspace validation and path normalization, always run: + +```python +tool = self._load_code_tools()[tool_name] +ctx = SimpleNamespace(cwd=str(workspace)) +result = await tool.call(normalized_params, ctx) +``` + +- keep AgentFlow-owned path normalization and workspace identity enforcement exactly as the valid existing tests expect + +Update `configs/sandbox-server/code_config.json` so the `code` backend config becomes: + +```json +"config": { + "workspace_root": "/tmp/agentflow_code" +} +``` + +and update the description string to describe the backend as vendored/internal rather than powered by an external repository. + +- [ ] **Step 4: Run the backend regression subset** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py +``` + +Expected: PASS after the obsolete tests/helpers are removed and the remaining coverage is adapted to the internal vendored model. 
+ +- [ ] **Step 5: Commit the backend simplification** + +```bash +git add sandbox/server/backends/resources/code.py \ + sandbox/tests/test_code_backend.py \ + configs/sandbox-server/code_config.json +git commit -m "refactor: vendor code backend runtime" +``` + +## Chunk 3: Refresh Schema and Config Tests Around the New Contract + +### Task 3: Update schema docs, config-loading tests, and obsolete assertions + +**Files:** +- Modify: `sandbox/tool_schemas/code_tools.py` +- Modify: `sandbox/tests/test_code_tool_schemas.py` +- Modify: `sandbox/tests/test_sandbox_config_loading.py` + +- [ ] **Step 1: Write the failing schema/config assertions** + +Update the tests to the new expected contract: + +```python +def test_code_bash_description_mentions_workspace_shell_execution(): + schema = _code_schemas_by_name()["code-bash"] + description = schema["description"].lower() + + assert "workspace" in description + assert "shell command" in description + assert "backend config" not in description + + +def test_load_server_config_keeps_workspace_root_for_code_backend(tmp_path): + config_path = tmp_path / "code_config.json" + raw_config = { + "resources": { + "code": { + "enabled": True, + "config": { + "workspace_root": "/tmp/agentflow_code" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + loaded = sandbox._load_server_config() + + assert loaded["resources"]["code"]["config"]["workspace_root"] == "/tmp/agentflow_code" +``` + +- [ ] **Step 2: Run the schema/config tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_tool_schemas.py sandbox/tests/test_sandbox_config_loading.py +``` + +Expected: FAIL because the current schema text still mentions backend-config-dependent availability and the config-loading test still asserts `CLAUDE_CODE_ROOT` expansion. 
+ +- [ ] **Step 3: Update schema text and config-loading coverage** + +In `sandbox/tool_schemas/code_tools.py`, change `code-bash` to something like: + +```python +{ + "name": "code-bash", + "description": "Run a shell command in the coding workspace using the current workspace as the working directory.", + "parameters": [ + { + "name": "command", + "type": "string", + "description": "Shell command to execute.", + "required": True, + } + ], +} +``` + +In `sandbox/tests/test_sandbox_config_loading.py`, replace the `CLAUDE_CODE_ROOT` test with the simplified `workspace_root` expectation and remove `monkeypatch.delenv("CLAUDE_CODE_ROOT", ...)`. + +- [ ] **Step 4: Run the updated schema/config regression suite** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/test_config.py +``` + +Expected: PASS. The code tool docs and config-loading tests now reflect the new single-repository contract. + +- [ ] **Step 5: Commit the schema/config cleanup** + +```bash +git add sandbox/tool_schemas/code_tools.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py +git commit -m "test: align code backend schema and config coverage" +``` + +## Chunk 4: Add the Opt-In Real Rollout Smoke + +### Task 4: Add MCP-style env-gated real `code` rollout smoke support + +**Files:** +- Create: `rollout/tests/conftest.py` +- Create: `rollout/tests/test_code_real_smoke.py` + +- [ ] **Step 1: Write the smoke test first** + +Create `rollout/tests/test_code_real_smoke.py` with a real-smoke shape like: + +```python +import json +from pathlib import Path + +from rollout import RolloutConfig, RolloutPipeline + + +def test_code_real_rollout_smoke(tmp_path, code_real_settings): + fixture_repo = tmp_path / "fixture_repo" + fixture_repo.mkdir() + nested = fixture_repo / "nested" + nested.mkdir() + token = "AF_CODE_SMOKE_TOKEN_7F3A91" + (nested / 
"TOKEN.txt").write_text(token + "\n", encoding="utf-8") + + benchmark_path = tmp_path / "benchmark.jsonl" + benchmark_path.write_text( + json.dumps( + { + "id": "code-real-smoke", + "question": "Use the available code tools to read nested/TOKEN.txt. Reply with only the exact token.", + "answer": token, + } + ) + + "\n", + encoding="utf-8", + ) + + config = RolloutConfig( + benchmark_name="code_real_smoke", + data_path=str(benchmark_path), + output_dir=str(tmp_path / "out"), + model_name=code_real_settings["model"], + api_key=code_real_settings["api_key"], + base_url=code_real_settings["base_url"], + max_turns=5, + available_tools=["code-*"], + resource_types=["code"], + resource_init_configs={"code": {"content": {"source_dir": str(fixture_repo)}}}, + sandbox_config_path="configs/sandbox-server/code_config.json", + sandbox_auto_start=True, + evaluate_results=False, + save_trajectories=True, + number_of_tasks=1, + ) + + summary = RolloutPipeline(config, output_dir=str(tmp_path / "out")).run() + + assert summary.total_tasks == 1 + assert summary.successful_tasks == 1 +``` + +- [ ] **Step 2: Run collection to verify the smoke currently fails** + +Run: + +```bash +PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: FAIL because the new smoke file or its `code_real_settings` fixture does not exist yet. 
+ +- [ ] **Step 3: Add MCP-style collection gating and credential CLI options** + +Create `rollout/tests/conftest.py`: + +```python +import os +from pathlib import Path + +import pytest + + +_REAL_CODE_TEST_FILES = { + "test_code_real_smoke.py", +} + + +def pytest_addoption(parser): + parser.addoption("--real-api-key", action="store", default="") + parser.addoption("--real-base-url", action="store", default="") + parser.addoption("--real-model", action="store", default="") + + +def pytest_ignore_collect(collection_path, config): + if os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1": + return False + + path = Path(str(collection_path)) + return path.name in _REAL_CODE_TEST_FILES + + +@pytest.fixture +def code_real_settings(request): + api_key = request.config.getoption("--real-api-key") + base_url = request.config.getoption("--real-base-url") + model = request.config.getoption("--real-model") + if not api_key or not base_url or not model: + pytest.skip( + "Provide --real-api-key, --real-base-url, and --real-model to run code_real smoke tests." + ) + return {"api_key": api_key, "base_url": base_url, "model": model} +``` + +Then complete `rollout/tests/test_code_real_smoke.py` so it also: + +- locates the results file written by `RolloutPipeline` +- loads the single saved result +- asserts there is at least one `code:*` tool call in the trajectory +- asserts the final answer equals the unique token +- asserts the token appears in the tool-result chain + +- [ ] **Step 4: Verify collection gating behavior** + +Run: + +```bash +PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: no tests collected from this file unless opt-in is enabled, matching the MCP real-smoke pattern. + +Run: + +```bash +AGENTFLOW_RUN_CODE_REAL=1 PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: PASS. The real-smoke file is now collected explicitly, matching MCP's env-gated behavior. 
+ +- [ ] **Step 5: Commit the real-smoke scaffolding** + +```bash +git add rollout/tests/conftest.py \ + rollout/tests/test_code_real_smoke.py +git commit -m "test: add opt-in code rollout real smoke" +``` + +## Chunk 5: Final Verification and Live Smoke Run + +### Task 5: Run the full targeted regression suite and the real smoke with supplied credentials + +**Files:** +- No code changes expected + +- [ ] **Step 1: Run the full targeted regression suite** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/test_config.py \ + rollout/tests/test_integration.py +``` + +Expected: PASS. The vendored tool package, the simplified backend, schema/config coverage, and rollout baseline all pass together. + +- [ ] **Step 2: Run the real rollout smoke with explicit credentials** + +Run: + +```bash +AGENTFLOW_RUN_CODE_REAL=1 PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py \ + --real-api-key '<api-key>' \ + --real-base-url '<base-url>' \ + --real-model '<model>' \ + -s +``` + +Replace `<api-key>`, `<base-url>`, and `<model>` with real values at execution time; if any of the three is empty, the `code_real_settings` fixture skips the test instead of running it. + +Expected: PASS. Sandbox starts, the `code` session is created, at least one real `code:*` tool call occurs, and the final answer matches the unique token from the fixture repo. 
+ +- [ ] **Step 3: Inspect the real-smoke output** + +Verify in the saved trajectory/result payload that: + +- the trajectory contains at least one `code:*` tool call +- at least one `code:read` or `code:glob` appears +- the token from `nested/TOKEN.txt` is present in tool-result observations +- the final predicted answer equals the token exactly + +- [ ] **Step 4: Commit the integrated result** + +```bash +git status --short +git add sandbox/server/backends/resources/code_vendor \ + sandbox/server/backends/resources/code.py \ + configs/sandbox-server/code_config.json \ + sandbox/tests/test_code_vendor_tools.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tool_schemas/code_tools.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/conftest.py \ + rollout/tests/test_code_real_smoke.py +git commit -m "refactor: vendor code backend tools into agentflow" +``` + +- [ ] **Step 5: Record final verification notes** + +Capture: + +- exact targeted pytest command and pass result +- exact `AGENTFLOW_RUN_CODE_REAL=1 ... pytest ...` command used +- whether the real smoke passed +- any residual risk, especially around powerful `bash` behavior and live-model variability + +## Execution Notes + +- Use `PYTHONPATH=.` for pytest commands in this repository unless the execution harness already injects the repo root. +- Keep path-boundary enforcement in `CodeBackend`; do not push it into the vendored tool files. +- Do not leave any `claude_code_root`, `allow_bash`, or `bash_timeout_seconds` references behind in tests or config templates. +- Do not widen the real-smoke scope into general rollout refactoring. + +Plan complete and saved to `docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md`. Ready to execute? 
diff --git a/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md new file mode 100644 index 0000000..7085867 --- /dev/null +++ b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md @@ -0,0 +1,356 @@ +## Code Backend Single-Repository Vendoring Design + +Date: 2026-04-15 +Status: Approved for planning +Supersedes: `docs/superpowers/specs/2026-04-15-code-backend-design.md` + +## Summary + +AgentFlow's current `code` backend depends on an external `claude-code-py` source tree through `claude_code_root`. That makes the feature non-portable, couples runtime behavior to a closed or separately managed repository, and introduces an avoidable configuration requirement. + +This design replaces that approach with an internal vendored compatibility layer inside AgentFlow. AgentFlow will vendor the minimal upstream code-tool subset it actually needs, and the `code` backend will load those vendored classes directly from the AgentFlow repository. + +The backend remains a session-scoped resource backend with per-worker workspaces and AgentFlow-owned path-boundary enforcement. The six exposed coding tools remain: + +- `code:read` +- `code:glob` +- `code:grep` +- `code:bash` +- `code:edit` +- `code:write` + +Unlike the current implementation, all six tools, including `bash`, will execute through vendored upstream-style tool classes. AgentFlow will stop treating `bash` as a separately wrapped special case. + +## Problem Statement + +The current design has three architectural problems: + +- It requires `claude_code_root` in sandbox config, which breaks single-repository portability. +- It relies on dynamic source loading from another tree, which is brittle and hard to reason about. 
+- It contains an internal inconsistency: `code:bash` is nominally part of the reused six-tool set, but in practice it bypasses the loaded upstream `BashTool` and runs through an AgentFlow-specific subprocess wrapper. + +These are not desirable "advanced configuration" choices. They are design mistakes for a feature that should ship as a self-contained AgentFlow capability. + +## Goals + +- Make the `code` backend runnable from the AgentFlow repository alone. +- Keep the six coding tools behaviorally aligned with the upstream lightweight tool implementations. +- Remove all dependency on external `claude-code-py` runtime paths and dynamic import plumbing. +- Keep `code` as a session-scoped backend with isolated worker workspaces. +- Preserve AgentFlow's existing rollout and sandbox abstractions. +- Add a clear test strategy that covers unit behavior, backend integration, and a real rollout smoke path. + +## Non-Goals + +- Do not vendor the full `claude-code-py` runtime. +- Do not vendor query loops, skills, tracing, memory loading, or sub-agent functionality. +- Do not add hard shell sandboxing. +- Do not redesign rollout configuration, sandbox protocols, or tool schema conventions. +- Do not keep backward compatibility for `claude_code_root`, `allow_bash`, or `bash_timeout_seconds`. + +## Core Decisions + +### 1. Vendor a minimal compatibility layer + +AgentFlow will vendor only the minimal code-tool slice needed for the `code` backend: + +- a minimal `Tool` base class +- `ReadTool` +- `GlobTool` +- `GrepTool` +- `BashTool` +- `EditTool` +- `WriteTool` + +The vendored code should be a small, clearly bounded package inside AgentFlow, with only the minimum import adjustments required to make it internal and self-contained. + +### 2. Remove the external-root model completely + +The new design deletes the idea that AgentFlow should discover coding tools from another source tree at runtime. 
+ +Delete these concepts from implementation, config, tests, and docs: + +- `claude_code_root` +- dynamic import of upstream files +- root-local support-module loading +- compatibility tests that verify loading from an external tree + +This is an intentional removal, not a soft deprecation. + +### 3. Treat `bash` as a normal member of the six-tool set + +The vendored `BashTool` will be used the same way as the other five vendored tools: through the common tool-loading path and `tool.call(params, ctx)` execution model. + +Delete these concepts from implementation, config, tests, and docs: + +- `allow_bash` +- `bash_timeout_seconds` +- AgentFlow-specific `_run_bash_command()` behavior +- config-availability messaging for `code-bash` + +The `code` backend will expose all six tools all the time. + +### 4. Keep AgentFlow-owned environment boundaries + +Vendoring the tool classes does not move workspace safety into the vendored code. AgentFlow still owns: + +- per-worker workspace creation +- `source_dir` copying +- worker/session identity checks +- file-path normalization relative to workspace +- path-escape rejection for file-oriented tools + +This separation keeps the vendored code small and keeps environment policy at the backend boundary where AgentFlow already owns session state. + +## Architecture + +### Vendored package layout + +Add a dedicated internal package for the vendored code tools, for example: + +- `sandbox/server/backends/resources/code_vendor/__init__.py` +- `sandbox/server/backends/resources/code_vendor/tool.py` +- `sandbox/server/backends/resources/code_vendor/file_tools.py` +- `sandbox/server/backends/resources/code_vendor/edit_tools.py` + +The package name should make it obvious that this is a bounded internal compatibility layer, not a general-purpose reimplementation of `claude-code-py`. 
+ +### Backend responsibilities + +`CodeBackend` remains responsible for: + +- registering the six `code:*` bridge tools +- creating and cleaning per-worker workspaces +- copying optional `source_dir` contents into the session workspace +- validating the session workspace against the worker id +- enforcing file-path boundaries for file-oriented tools +- instantiating and caching the six vendored tool classes + +`CodeBackend` no longer needs: + +- `_get_claude_code_root()` +- `_validate_claude_code_root_prerequisites()` +- `_load_root_support_modules()` +- dynamic module alias installation +- `_run_bash_command()` +- any `bash`-only dispatch branch + +### Runtime flow + +The runtime flow becomes: + +1. `initialize()` validates `worker_id`, prepares a staged workspace, and optionally copies `source_dir`. +2. The backend ensures vendored tool instances are loaded from the repository itself. +3. The staged workspace becomes the active workspace. +4. Bridge dispatch resolves the worker session workspace. +5. For file-oriented tools, AgentFlow normalizes and bounds path-like parameters to the workspace. +6. The backend creates a minimal context adapter with `cwd=<workspace>`, where `<workspace>` is the resolved session workspace path. +7. All six tools execute through the same vendored `tool.call(...)` path. +8. AgentFlow wraps results into standard backend success/error responses. + +### Minimal context adapter + +The vendored six-tool subset only needs a tiny runtime context: + +```python +SimpleNamespace(cwd=str(workspace)) +``` + +No full agent runtime model is needed. 
+ +## Tool Behavior Contract + +The tool surface remains unchanged: + +- prompt-visible schemas stay `code-read`, `code-glob`, `code-grep`, `code-bash`, `code-edit`, `code-write` +- runtime names stay `code:read`, `code:glob`, `code:grep`, `code:bash`, `code:edit`, `code:write` +- parameter names remain aligned with the vendored upstream tool classes + +Behaviorally, the backend should preserve: + +- line-numbered `read` output +- recursive globbing behavior +- recursive grep behavior with optional file filter +- exact-match edit semantics with uniqueness checks +- full-file overwrite semantics for `write` +- upstream-style shell execution behavior for `bash` + +AgentFlow should not add new `bash`-specific runtime policy once vendoring is complete. + +## Configuration Design + +### Backend config + +After the redesign, `code` backend config should keep only what is still meaningfully owned by AgentFlow: + +- `workspace_root` + +The config example should therefore look like: + +```json +{ + "server": { + "url": "http://127.0.0.1:18890", + "port": 18890, + "session_ttl": 300 + }, + "resources": { + "code": { + "enabled": true, + "description": "Lightweight coding backend with vendored upstream-style tools", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "workspace_root": "/tmp/agentflow_code" + } + } + }, + "warmup": { + "enabled": false, + "resources": [] + } +} +``` + +### Session init config + +Session init remains intentionally small: + +- `source_dir`: optional directory copied into the session workspace + +Rollout-facing use stays: + +- `resource_types=["code"]` +- `available_tools=["code-*"]` +- `resource_init_configs["code"]["content"]["source_dir"]` + +## Testing Strategy + +The testing changes must be explicit. This work is not only about adding tests; it also requires deleting tests and rewriting tests that lock in the old design mistake. 
+ +### Delete old tests + +Delete tests that exist only to validate the old external-root or AgentFlow-specific bash wrapper model, including categories such as: + +- external `claude_code_root` requirement +- fake external upstream roots +- dynamic loading from another repository +- root-local support-module loading +- isolated-per-root loader behavior +- `allow_bash` gating +- `bash_timeout_seconds` +- AgentFlow-specific `bash` input validation that no longer exists in the vendored-upstream model +- config-template checks that still mention deleted fields +- env-var expansion tests whose only purpose was `CLAUDE_CODE_ROOT` + +### Modify existing tests + +Keep and adapt the tests that remain valid for the new architecture: + +- tool registration tests +- workspace initialization and `source_dir` copy tests +- workspace recreation tests +- cleanup safety tests +- `worker_id` validation tests +- session workspace identity and boundary tests +- file-path normalization and escape rejection tests +- successful bridge dispatch and standard response-shape tests +- tool schema presence/filtering/parameter-contract tests + +Schema tests must update descriptions so `code-bash` no longer claims backend-config-dependent availability. + +### Add new tests + +Add new focused tests for the vendored model: + +- vendored tool loading from the internal package +- all six tools executing through the same tool-call path +- vendored `BashTool` behavior contract +- vendored `EditTool`/`WriteTool` behavior contract where existing bridge tests do not already cover it + +### Add a rollout-facing smoke test + +Add one end-to-end rollout smoke that exercises the real rollout-to-sandbox-to-code-backend path. 
+ +The smoke should: + +- live under `rollout/tests/` +- follow the MCP real-smoke opt-in pattern +- not be collected in default pytest execution +- require explicit manual invocation, for example by setting `AGENTFLOW_RUN_CODE_REAL=1` +- use a real LLM response path and a real sandbox/code backend path + +This smoke should not mock sandbox components. It should really: + +- start sandbox +- create a `code` session +- copy a tiny fixture repo into the workspace +- expose `code-*` tools through rollout +- execute at least one real `code:*` tool call + +Recommended smoke structure: + +1. Create a temporary fixture repo with a uniquely identifiable file, for example `nested/TOKEN.txt`. +2. Write a hard-to-guess token into that file. +3. Create a one-task benchmark asking the agent to use code tools to read the file and return only the exact token. +4. Run `RolloutPipeline` with: + - `available_tools=["code-*"]` + - `resource_types=["code"]` + - `resource_init_configs["code"]["content"]["source_dir"]=fixture_repo` + - `sandbox_config_path="configs/sandbox-server/code_config.json"` + - `sandbox_auto_start=True` + - `number_of_tasks=1` + - `evaluate_results=False` + - `save_trajectories=True` +5. Assert that: + - the task succeeds + - the trajectory contains at least one `code:*` tool call + - the final answer equals the token + - the token appears in the observed tool-result chain + +Credential provisioning for that opt-in real smoke remains an execution-time concern and must not be hardcoded into repository defaults. 
+ +## Documentation Changes + +Update all user-facing and internal docs that still describe the deleted design: + +- `configs/sandbox-server/code_config.json` +- `sandbox/tests/test_sandbox_config_loading.py` +- `sandbox/tests/test_code_tool_schemas.py` +- any code-backend README/tutorial snippets +- the prior `2026-04-15-code-backend-design.md` should be treated as superseded + +The resulting documentation should consistently present the `code` backend as a native AgentFlow capability. + +## Risks + +- Vendored code can drift from future upstream changes. + Mitigation: treat the vendored subset as an intentionally frozen internal compatibility layer and cover it with explicit behavior tests. + +- Real rollout smoke tests can be flaky because they depend on live model behavior and external connectivity. + Mitigation: keep them opt-in and strongly constrain the task prompt and fixture. + +- `bash` remains powerful because it executes shell commands relative to the workspace but without OS-level isolation. + Mitigation: document this clearly as an inherent property of the `code` backend rather than disguising it behind partial configuration toggles. + +## Recommended Implementation Order + +1. Vendor the minimal upstream six-tool compatibility layer into AgentFlow. +2. Simplify `CodeBackend` to load vendored tools directly and remove all external-root logic. +3. Remove `bash` special handling so all six tools share one execution path. +4. Simplify `code` backend config to `workspace_root` only. +5. Update schema descriptions and sandbox config examples. +6. Delete old tests tied to the removed design. +7. Adapt retained backend and schema tests. +8. Add vendored-tool behavior coverage. +9. Add the opt-in real rollout smoke. + +## Decision + +AgentFlow should stop treating the `code` backend as a thin adapter over an external source tree and instead ship a self-contained, vendored upstream-style compatibility layer inside the repository. 
+ +This restores the intended product boundary: + +- AgentFlow owns the coding environment as a native feature +- all six code tools are internally available +- rollout and sandbox integration stay unchanged +- the repository becomes portable again diff --git a/rollout/tests/conftest.py b/rollout/tests/conftest.py new file mode 100644 index 0000000..8c6c13d --- /dev/null +++ b/rollout/tests/conftest.py @@ -0,0 +1,134 @@ +import os +from pathlib import Path + +import pytest + + +_REAL_CODE_TEST_FILES = { + "test_code_real_smoke.py", +} + + +def _code_real_enabled(): + return os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1" + + +def pytest_ignore_collect(collection_path, config): + del config + if _code_real_enabled(): + return False + + path = Path(str(collection_path)) + return path.name in _REAL_CODE_TEST_FILES + + +def pytest_addoption(parser): + group = parser.getgroup("agentflow-code-real") + group.addoption( + "--real-api-key", + action="store", + default=None, + help="API key for opt-in real code rollout smoke tests.", + ) + group.addoption( + "--real-base-url", + action="store", + default=None, + help="Base URL for opt-in real code rollout smoke tests.", + ) + group.addoption( + "--real-model", + action="store", + default=None, + help="Model name for opt-in real code rollout smoke tests.", + ) + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "code_real: opt-in real code rollout smoke tests", + ) + + +def pytest_collection_modifyitems(config, items): + if _code_real_enabled(): + return + + deselected = [] + kept = [] + for item in items: + if item.get_closest_marker("code_real") is None: + kept.append(item) + continue + + if Path(str(item.fspath)).name in _REAL_CODE_TEST_FILES: + deselected.append(item) + else: + kept.append(item) + + if deselected: + items[:] = kept + config.hook.pytest_deselected(items=deselected) + + +def _get_real_credentials(config): + return { + "api_key": config.getoption("--real-api-key"), + "base_url": 
config.getoption("--real-base-url"), + "model": config.getoption("--real-model"), + } + + +def _missing_real_credential_options(config): + credentials = _get_real_credentials(config) + return [ + option_name + for option_name, value in ( + ("--real-api-key", credentials["api_key"]), + ("--real-base-url", credentials["base_url"]), + ("--real-model", credentials["model"]), + ) + if not value + ] + + +def pytest_runtest_setup(item): + if item.get_closest_marker("code_real") is None: + return + + if not _code_real_enabled(): + pytest.skip("set AGENTFLOW_RUN_CODE_REAL=1 to run real code rollout smoke tests") + + missing = _missing_real_credential_options(item.config) + if missing: + pytest.skip( + "code_real tests require all of " + "--real-api-key, --real-base-url, and --real-model" + ) + + +@pytest.fixture +def real_llm_credentials(request): + credentials = _get_real_credentials(request.config) + if _missing_real_credential_options(request.config): + pytest.skip( + "code_real tests require all of " + "--real-api-key, --real-base-url, and --real-model" + ) + return credentials + + +@pytest.fixture +def real_api_key(real_llm_credentials): + return real_llm_credentials["api_key"] + + +@pytest.fixture +def real_base_url(real_llm_credentials): + return real_llm_credentials["base_url"] + + +@pytest.fixture +def real_model(real_llm_credentials): + return real_llm_credentials["model"] diff --git a/rollout/tests/test_code_real_smoke.py b/rollout/tests/test_code_real_smoke.py new file mode 100644 index 0000000..9634ec1 --- /dev/null +++ b/rollout/tests/test_code_real_smoke.py @@ -0,0 +1,95 @@ +import json + +import pytest + +from rollout import RolloutConfig, RolloutPipeline + + +pytestmark = pytest.mark.code_real + + +def _canonical_tool_name(name): + for separator in (".", "_", "-"): + if separator in name: + prefix, suffix = name.split(separator, 1) + return f"{prefix}:{suffix}" + return name + + +def test_code_real_smoke_reads_token_via_real_tools( + tmp_path, + 
real_api_key, + real_base_url, + real_model, +): + fixture_repo = tmp_path / "fixture_repo" + nested_dir = fixture_repo / "nested" + nested_dir.mkdir(parents=True) + + token = f"token-{tmp_path.name}" + (nested_dir / "TOKEN.txt").write_text(token + "\n", encoding="utf-8") + + benchmark_path = tmp_path / "benchmark.jsonl" + prompt = ( + "Use code tools to inspect the repository and read nested/TOKEN.txt. " + "Reply with only the exact token and nothing else." + ) + benchmark_path.write_text( + json.dumps( + { + "id": "code-real-smoke", + "question": prompt, + "answer": token, + } + ) + + "\n", + encoding="utf-8", + ) + + output_dir = tmp_path / "rollout_output" + config = RolloutConfig( + benchmark_name="code_real_smoke", + data_path=str(benchmark_path), + model_name=real_model, + api_key=real_api_key, + base_url=real_base_url, + available_tools=["code-*"], + resource_types=["code"], + resource_init_configs={ + "code": {"content": {"source_dir": str(fixture_repo)}} + }, + sandbox_config_path="configs/sandbox-server/code_config.json", + sandbox_auto_start=True, + evaluate_results=False, + save_trajectories=True, + max_turns=5, + number_of_tasks=1, + ) + + assert config.max_turns == 5 + + summary = RolloutPipeline(config, output_dir=str(output_dir)).run() + + assert summary.total_tasks == 1 + assert summary.successful_tasks == 1 + assert summary.failed_tasks == 0 + + result_files = sorted(output_dir.glob("results_code_real_smoke_*.jsonl")) + assert result_files + + payload = json.loads(result_files[-1].read_text(encoding="utf-8").strip()) + trajectory = payload["trajectory"] + tool_calls = trajectory["tool_calls"] + tool_messages = [ + message for message in trajectory["messages"] if message["role"] == "tool" + ] + + assert any( + _canonical_tool_name(call["tool_name"]).startswith("code:") + for call in tool_calls + ) + assert payload["predicted_answer"] == token + assert trajectory["final_answer"] == token + assert any( + token in json.dumps(message, 
ensure_ascii=False) for message in tool_messages + ) diff --git a/sandbox/server/backends/resources/__init__.py b/sandbox/server/backends/resources/__init__.py index 0042aa9..b958141 100644 --- a/sandbox/server/backends/resources/__init__.py +++ b/sandbox/server/backends/resources/__init__.py @@ -27,7 +27,7 @@ ```python from sandbox.server import HTTPServiceServer from sandbox.server.backends.resources import ( - VMBackend, + VMBackend, RAGBackend ) @@ -59,18 +59,17 @@ ``` """ -from .vm import VMBackend, create_vm_backend -from .rag import RAGBackend, create_rag_backend +from .code import CodeBackend from .mcp import MCPBackend, ToolathlonGymBackend +from .rag import RAGBackend, create_rag_backend +from .vm import VMBackend, create_vm_backend __all__ = [ - # Backend classes "VMBackend", "RAGBackend", "MCPBackend", + "CodeBackend", "ToolathlonGymBackend", - - # Convenience factories "create_vm_backend", "create_rag_backend", ] diff --git a/sandbox/server/backends/resources/code.py b/sandbox/server/backends/resources/code.py new file mode 100644 index 0000000..a4d3248 --- /dev/null +++ b/sandbox/server/backends/resources/code.py @@ -0,0 +1,342 @@ +""" +Code backend skeleton for lightweight coding workspace integration. 
+""" + +from __future__ import annotations + +import re +import shutil +import time +import uuid +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +from sandbox.server.backends.base import Backend, BackendConfig +from sandbox.server.backends.error_codes import ErrorCode +from sandbox.server.backends.response_builder import ( + build_error_response, + build_success_response, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) + + +class CodeBackend(Backend): + name = "code" + description = "Code Backend - lightweight coding workspace integration" + version = "1.0.0" + + def __init__(self, config: BackendConfig | None = None): + if config is None: + config = BackendConfig( + enabled=True, + default_config={ + "workspace_root": "/tmp/agentflow_code", + }, + description="Code backend", + ) + super().__init__(config) + self._tool_instances: dict[str, Any] | None = None + + def bind_server(self, server) -> None: + super().bind_server(server) + for tool_name in ("read", "glob", "grep", "bash", "edit", "write"): + server.register_tool( + f"code:{tool_name}", + self._make_bridge_tool(tool_name), + resource_type="code", + ) + + async def initialize(self, worker_id: str, config: dict) -> dict: + source_dir = self._resolve_source_dir(config) + workspace, staged_workspace, previous_workspace = self._prepare_workspace(worker_id) + + try: + if source_dir: + self._copy_source_dir(source_dir, staged_workspace) + + self._load_code_tools() + self._commit_prepared_workspace(workspace, staged_workspace, previous_workspace) + except Exception: + if staged_workspace.exists(): + shutil.rmtree(staged_workspace) + if previous_workspace is not None and previous_workspace.exists() and not workspace.exists(): + previous_workspace.rename(workspace) + raise + + return { + "workspace": 
str(workspace), + "source_dir": str(source_dir) if source_dir else "", + } + + async def cleanup(self, worker_id: str, session_info: dict) -> None: + workspace_value = ((session_info or {}).get("data") or {}).get("workspace") + if not isinstance(workspace_value, str) or not workspace_value.strip(): + return None + + try: + workspace = Path(workspace_value).resolve() + workspace_root = self._get_workspace_root().resolve() + expected_workspace = (workspace_root / self._validate_worker_id(worker_id)).resolve( + strict=False + ) + workspace.relative_to(workspace_root) + except (OSError, RuntimeError, ValueError, TypeError): + return None + + if workspace != expected_workspace: + return None + if workspace.exists() and workspace.is_dir(): + shutil.rmtree(workspace) + return None + + def _get_workspace_root(self) -> Path: + value = self.get_default_config().get("workspace_root") or "/tmp/agentflow_code" + return Path(value) + + def _prepare_workspace(self, worker_id: str) -> tuple[Path, Path, Path | None]: + safe_worker_id = self._validate_worker_id(worker_id) + workspace_root = self._get_workspace_root() + workspace_root.mkdir(parents=True, exist_ok=True) + workspace = workspace_root / safe_worker_id + staged_workspace = workspace_root / f".{safe_worker_id}.staged-{uuid.uuid4().hex}" + previous_workspace = ( + workspace_root / f".{safe_worker_id}.previous-{uuid.uuid4().hex}" + if workspace.exists() + else None + ) + staged_workspace.mkdir(parents=True, exist_ok=False) + return workspace, staged_workspace, previous_workspace + + def _commit_prepared_workspace( + self, + workspace: Path, + staged_workspace: Path, + previous_workspace: Path | None, + ) -> None: + if previous_workspace is not None: + workspace.rename(previous_workspace) + staged_workspace.rename(workspace) + if previous_workspace is not None and previous_workspace.exists(): + shutil.rmtree(previous_workspace) + + def _validate_worker_id(self, worker_id: str) -> str: + if not isinstance(worker_id, str) or 
not worker_id: + raise ValueError("worker_id must be a non-empty string") + if worker_id in {".", ".."}: + raise ValueError("worker_id contains unsafe path traversal") + if worker_id != Path(worker_id).name: + raise ValueError("worker_id must be a single safe path component") + if not re.fullmatch(r"[A-Za-z0-9._-]+", worker_id): + raise ValueError("worker_id contains unsupported characters") + return worker_id + + def _resolve_source_dir(self, config: dict | None) -> Path | None: + config = config or {} + value = config.get("source_dir") + if not value: + return None + source_dir = Path(value) + if not source_dir.exists(): + raise ValueError(f"source_dir does not exist: {source_dir}") + if not source_dir.is_dir(): + raise ValueError(f"source_dir is not a directory: {source_dir}") + return source_dir + + def _copy_source_dir(self, source_dir: Path, workspace: Path) -> None: + if not source_dir.exists(): + return + for child in source_dir.iterdir(): + destination = workspace / child.name + if child.is_dir(): + shutil.copytree(child, destination, dirs_exist_ok=True) + else: + shutil.copy2(child, destination) + + def _load_code_tools(self) -> dict[str, Any]: + if self._tool_instances is None: + self._tool_instances = { + "read": ReadTool(), + "glob": GlobTool(), + "grep": GrepTool(), + "bash": BashTool(), + "edit": EditTool(), + "write": WriteTool(), + } + return self._tool_instances + + def _make_bridge_tool(self, tool_name: str): + async def bridge_tool(session_info: dict, **params): + return await self._dispatch(tool_name, session_info, params) + + bridge_tool.__name__ = f"code_{tool_name}" + return bridge_tool + + async def _dispatch( + self, + tool_name: str, + session_info: dict, + params: dict[str, Any], + ) -> dict[str, Any]: + start_time = time.time() + full_name = f"{self.name}:{tool_name}" + session_id = (session_info or {}).get("session_id") + runtime_params = dict(params or {}) + trace_id = runtime_params.pop("trace_id", None) + worker_id = 
runtime_params.pop("worker_id", None) + runtime_params.pop("session_id", None) + + tool = self._load_code_tools().get(tool_name) + if tool is None: + return build_error_response( + code=ErrorCode.INVALID_REQUEST_FORMAT, + message=f"Unknown code tool: {tool_name}", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + workspace_value = ((session_info or {}).get("data") or {}).get("workspace") + if not isinstance(workspace_value, str) or not workspace_value.strip(): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message="Invalid session workspace: missing or empty data.workspace", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + try: + workspace = Path(workspace_value).resolve(strict=False) + workspace_root = self._get_workspace_root().resolve() + expected_workspace = (workspace_root / self._validate_worker_id(worker_id)).resolve( + strict=False + ) + workspace.relative_to(workspace_root) + except (OSError, RuntimeError, ValueError, TypeError): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message="Invalid session workspace: must resolve inside workspace_root", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + if workspace != expected_workspace or not workspace.exists() or not workspace.is_dir(): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message="Invalid session workspace: must match existing worker workspace", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + ctx = SimpleNamespace(cwd=str(workspace)) + try: + normalized_params = self._normalize_tool_params( + tool_name=tool_name, + 
params=runtime_params, + workspace=workspace, + ) + except ValueError as exc: + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message=str(exc), + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + try: + result = await tool.call(normalized_params, ctx) + except Exception as exc: + return build_error_response( + code=ErrorCode.EXECUTION_ERROR, + message=str(exc), + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + if isinstance(result, str) and result.startswith("Error:"): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message=result, + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + return build_success_response( + data=result, + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + def _normalize_tool_params( + self, + tool_name: str, + params: dict[str, Any], + workspace: Path, + ) -> dict[str, Any]: + normalized = dict(params) + workspace_path = workspace.resolve(strict=False) + + path_keys: tuple[str, ...] 
= () + if tool_name in {"read", "edit", "write"}: + path_keys = ("file_path",) + elif tool_name in {"glob", "grep"}: + path_keys = ("path",) + + for key in path_keys: + raw_value = normalized.get(key) + if not isinstance(raw_value, str) or not raw_value: + continue + value_path = Path(raw_value) + if value_path.is_absolute(): + resolved = value_path.resolve(strict=False) + else: + resolved = (workspace_path / value_path).resolve(strict=False) + + try: + resolved.relative_to(workspace_path) + except ValueError as exc: + raise ValueError( + f"Path parameter '{key}' must stay inside workspace" + ) from exc + + normalized[key] = str(resolved) + + if tool_name == "glob": + pattern = normalized.get("pattern") + if ( + isinstance(pattern, str) + and pattern + and re.search(r"(^|[\\/])\.\.([\\/]|$)", pattern) + ): + raise ValueError("Glob pattern must not contain parent traversal segments") + + return normalized diff --git a/sandbox/server/backends/resources/code_vendor/__init__.py b/sandbox/server/backends/resources/code_vendor/__init__.py new file mode 100644 index 0000000..1fc9cfa --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/__init__.py @@ -0,0 +1,11 @@ +from .edit_tools import EditTool, WriteTool +from .file_tools import BashTool, GlobTool, GrepTool, ReadTool + +__all__ = [ + "BashTool", + "EditTool", + "GlobTool", + "GrepTool", + "ReadTool", + "WriteTool", +] diff --git a/sandbox/server/backends/resources/code_vendor/edit_tools.py b/sandbox/server/backends/resources/code_vendor/edit_tools.py new file mode 100644 index 0000000..622658d --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/edit_tools.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .tool import Tool + + +class EditTool(Tool): + name = "Edit" + description = ( + "Perform an exact string replacement in a file. " + "old_string must uniquely identify the target location unless replace_all=true." 
+ ) + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "old_string": {"type": "string"}, + "new_string": {"type": "string"}, + "replace_all": {"type": "boolean", "default": False}, + }, + "required": ["file_path", "old_string", "new_string"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + old_string = args["old_string"] + new_string = args["new_string"] + replace_all = args.get("replace_all", False) + + if not path.exists(): + return f"Error: file not found: {path}" + + content = path.read_text(encoding="utf-8") + count = content.count(old_string) + if count == 0: + return f"Error: old_string not found in {path}. Read the file first to verify the exact text." + if count > 1 and not replace_all: + return ( + f"Error: old_string appears {count} times in {path}. " + "Provide more surrounding context to make it unique, or set replace_all=true." + ) + + if replace_all: + updated = content.replace(old_string, new_string) + replacements = count + else: + updated = content.replace(old_string, new_string, 1) + replacements = 1 + + path.write_text(updated, encoding="utf-8") + return f"Replaced {replacements} occurrence(s) in {path}" + + +class WriteTool(Tool): + name = "Write" + description = "Write content to a file, creating parent directories if needed." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "content": {"type": "string"}, + }, + "required": ["file_path", "content"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + content = args["content"] + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + line_count = content.count("\n") + if content and not content.endswith("\n"): + line_count += 1 + return f"Wrote {len(content)} bytes ({line_count} lines) to {path}" diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py new file mode 100644 index 0000000..1a61ff2 --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import asyncio +import io +import locale +import os +import signal +import subprocess +from pathlib import Path +from typing import Any + +from .tool import Tool + + +class BashTool(Tool): + name = "Bash" + description = "Execute a shell command and return stdout/stderr." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "command": {"type": "string", "description": "Shell command to run"}, + }, + "required": ["command"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + proc = await asyncio.create_subprocess_shell( + args["command"], + shell=True, + cwd=ctx.cwd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + start_new_session=True, + ) + + try: + stdout_bytes, stderr_bytes = await proc.communicate() + except asyncio.CancelledError: + if proc.returncode is None: + try: + os.killpg(proc.pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): + proc.kill() + await proc.communicate() + raise + + output = _decode_text_mode_output(stdout_bytes) + stderr = _decode_text_mode_output(stderr_bytes) + if proc.returncode: + return _format_command_error("bash", proc.returncode, output, stderr) + return _format_command_output(output, stderr) + + +def _decode_text_mode_output(data: bytes | None) -> str: + if not data: + return "" + + text_stream = io.TextIOWrapper( + io.BytesIO(data), + encoding=locale.getpreferredencoding(False), + newline=None, + ) + try: + return text_stream.read() + finally: + text_stream.detach() + + +def _format_command_output(stdout: str, stderr: str) -> str: + output = stdout + if stderr: + output += f"\n[stderr]:\n{stderr}" if output else f"[stderr]:\n{stderr}" + return output.strip() or "(no output)" + + +def _format_command_error(tool_name: str, returncode: int, stdout: str, stderr: str) -> str: + if returncode < 0: + status = f"signal {-returncode}" + else: + status = f"exit status {returncode}" + + summary = f"Error: {tool_name} command failed with {status}" + details = _format_command_output(stdout, stderr) + if details == "(no output)": + return summary + return f"{summary}\n{details}" + + +class ReadTool(Tool): + name = "Read" + description = "Read a file and return its contents with line numbers." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "offset": {"type": "integer", "description": "Start line (1-indexed)"}, + "limit": {"type": "integer", "description": "Maximum lines to return"}, + }, + "required": ["file_path"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + if not path.exists(): + return f"Error: file not found: {path}" + + lines = path.read_text(encoding="utf-8").splitlines() + offset = max(0, args.get("offset", 1) - 1) + limit = args.get("limit", 2000) + selected = lines[offset : offset + limit] + return "\n".join( + f"{line_number:4}→{line}" + for line_number, line in enumerate(selected, start=offset + 1) + ) + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True + + +class GlobTool(Tool): + name = "Glob" + description = "Find files matching a glob pattern." + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "Glob pattern"}, + "path": {"type": "string", "description": "Directory to search from"}, + }, + "required": ["pattern"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + base = Path(args.get("path", ctx.cwd)) + pattern = args["pattern"] + matches = sorted(base.glob(pattern)) + return "\n".join(str(match) for match in matches) or "(no matches)" + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True + + +class GrepTool(Tool): + name = "Grep" + description = "Search file contents with a regex pattern." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "Regex pattern"}, + "path": {"type": "string", "description": "Directory to search"}, + "glob": {"type": "string", "description": "Optional file glob filter"}, + }, + "required": ["pattern"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + base = Path(args.get("path", ctx.cwd)) + cmd = ["grep", "-r", "-n"] + if "glob" in args: + cmd += ["--include", args["glob"]] + cmd += ["--", args["pattern"], str(base)] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + return result.stdout or "(no matches)" + if result.returncode == 1: + return "(no matches)" + return _format_command_error("grep", result.returncode, result.stdout, result.stderr) + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True diff --git a/sandbox/server/backends/resources/code_vendor/tool.py b/sandbox/server/backends/resources/code_vendor/tool.py new file mode 100644 index 0000000..bef7084 --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/tool.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + + +class Tool(ABC): + name: str + description: str + + @property + @abstractmethod + def input_schema(self) -> dict[str, Any]: + raise NotImplementedError + + @abstractmethod + async def call(self, args: dict[str, Any], ctx: Any) -> str: + raise NotImplementedError + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return False + + def to_api_format(self) -> dict[str, Any]: + return { + "name": self.name, + "description": self.description, + "input_schema": self.input_schema, + } diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py new file mode 100644 index 0000000..fb48625 --- /dev/null +++ b/sandbox/tests/test_code_backend.py @@ -0,0 
+1,824 @@ +""" +Tests for the Code backend skeleton and bridge-tool registration. +""" + +import asyncio +import importlib +import os +import shlex +import sys +from pathlib import Path + +import pytest + +from sandbox.server.backends.base import BackendConfig +from sandbox.server.backends.error_codes import ErrorCode +from sandbox.server.config_loader import ConfigLoader +from sandbox.server.core.tool_executor import ToolExecutor + +MODULE_PATH = Path(__file__).resolve().parents[1] / "server" / "backends" / "resources" / "code.py" + + +def remove_resources_modules(): + package_name = "sandbox.server.backends.resources" + for module_name in list(sys.modules): + if module_name == package_name or module_name.startswith(f"{package_name}."): + sys.modules.pop(module_name, None) + + +def load_code_backend_module(): + remove_resources_modules() + return importlib.import_module("sandbox.server.backends.resources.code") + + +class FakeServer: + def __init__(self): + self._tools = {} + self._tool_resource_types = {} + + def register_tool(self, name, func, resource_type=None): + self._tools[name] = func + if resource_type is not None: + self._tool_resource_types[name] = resource_type + + +class FakeResourceRouter: + def __init__(self, session_info): + self._session_info = session_info + + async def get_session(self, worker_id, resource_type): + del worker_id, resource_type + return self._session_info + + async def get_or_create_session(self, worker_id, resource_type, auto_created=False): + del worker_id, resource_type, auto_created + raise AssertionError("unexpected temporary session creation") + + async def refresh_session(self, worker_id, resource_type): + del worker_id, resource_type + return True + + async def destroy_session(self, worker_id, resource_type): + del worker_id, resource_type + return True + + +def build_backend_config(tmp_path): + return BackendConfig( + enabled=True, + default_config={ + "workspace_root": str(tmp_path / "agentflow_code"), + }, + 
description="Code backend", + ) + + +def build_backend(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + return module, backend + + +def bind_backend_tools(backend): + fake_server = FakeServer() + backend.bind_server(fake_server) + return fake_server + + +def build_executor(fake_server, session_info): + return ToolExecutor( + tools=fake_server._tools, + tool_name_index={}, + tool_resource_types=fake_server._tool_resource_types, + resource_router=FakeResourceRouter(session_info), + ) + + +def execute_tool(executor, action, *, params, worker_id, trace_id): + return asyncio.run( + executor.execute( + action=action, + params=params, + worker_id=worker_id, + trace_id=trace_id, + ) + ) + + +def test_bind_server_registers_code_tools(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + + assert "code:read" in fake_server._tools + assert "code:bash" in fake_server._tools + assert fake_server._tool_resource_types["code:read"] == "code" + assert fake_server._tool_resource_types["code:bash"] == "code" + + +def test_resources_package_exports_eager_backends_without_lazy_machinery(): + remove_resources_modules() + + resources = importlib.import_module("sandbox.server.backends.resources") + code_module = importlib.import_module("sandbox.server.backends.resources.code") + mcp_module = importlib.import_module("sandbox.server.backends.resources.mcp") + + assert resources.CodeBackend is code_module.CodeBackend + assert resources.MCPBackend is mcp_module.MCPBackend + assert not hasattr(resources, "__getattr__") + + +def test_initialize_does_not_require_external_root(tmp_path): + _, backend = build_backend(tmp_path) + + session = asyncio.run(backend.initialize("runner_123", {})) + + assert session["workspace"].endswith("runner_123") + assert Path(session["workspace"]).exists() + + +def test_initialize_copies_source_dir(tmp_path): + _, backend = build_backend(tmp_path) + 
source_dir = tmp_path / "source" + source_dir.mkdir(parents=True) + (source_dir / "demo.py").write_text("print('hi')\n", encoding="utf-8") + + session = asyncio.run( + backend.initialize("runner_123", {"source_dir": str(source_dir)}) + ) + + copied = Path(session["workspace"]) / "demo.py" + assert copied.exists() + assert copied.read_text(encoding="utf-8") == "print('hi')\n" + + +def test_load_code_tools_uses_internal_vendor_package(tmp_path): + _, backend = build_backend(tmp_path) + + tools = backend._load_code_tools() + + assert set(tools.keys()) == {"read", "glob", "grep", "bash", "edit", "write"} + assert type(tools["read"]).__module__ == ( + "sandbox.server.backends.resources.code_vendor.file_tools" + ) + assert type(tools["edit"]).__module__ == ( + "sandbox.server.backends.resources.code_vendor.edit_tools" + ) + assert backend._load_code_tools() is tools + + +def test_tool_executor_code_dispatch_returns_standard_success_response(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + demo_file = runtime_workspace / "demo.py" + demo_file.write_text("hello from demo\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-1", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"] == " 1→hello from demo" + + +def test_tool_executor_code_dispatch_preserves_trace_id(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + demo_file = runtime_workspace / "demo.py" + demo_file.write_text("hello from demo\n", encoding="utf-8") + 
executor = build_executor( + fake_server, + { + "session_id": "code-session-trace", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-preserve-1", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["meta"]["trace_id"] == "trace-preserve-1" + + +def test_tool_executor_runs_bash_via_vendored_tool(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-bash", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:bash", + params={"command": "pwd"}, + worker_id="worker-1", + trace_id="trace-bash", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"] == str(runtime_workspace.resolve(strict=False)) + + +def test_tool_executor_returns_business_failure_for_vendored_grep_error(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + sample_file = runtime_workspace / "sample.txt" + sample_file.write_text("alpha\nbeta\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-grep-error", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:grep", + params={"pattern": "[", "path": str(runtime_workspace)}, + worker_id="worker-1", + trace_id="trace-grep-error", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert result["message"].startswith("Error:") + assert "exit status 2" in result["message"] + assert "[stderr]:" in result["message"] + + +def 
test_tool_executor_returns_business_failure_for_vendored_bash_error(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-bash-error", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:bash", + params={ + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; " + "print('out'); " + "print('err', file=sys.stderr); " + "raise SystemExit(7)\"" + ) + }, + worker_id="worker-1", + trace_id="trace-bash-error", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert result["message"].startswith("Error:") + assert "exit status 7" in result["message"] + assert "out" in result["message"] + assert "[stderr]:" in result["message"] + assert "err" in result["message"] + + +def test_tool_executor_non_bash_timeout_uses_standard_error_handling(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + + class TimeoutReadTool: + async def call(self, params, ctx): + del params, ctx + raise asyncio.TimeoutError("read timeout") + + tools = backend._load_code_tools() + backend._tool_instances = dict(tools) + backend._tool_instances["read"] = TimeoutReadTool() + + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + demo_file = runtime_workspace / "demo.py" + demo_file.write_text("hello from demo\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-read-timeout", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-read-timeout", + ) + + assert result["code"] == ErrorCode.EXECUTION_ERROR + assert result["message"] == 
"read timeout" + + +def test_code_write_relative_file_path_resolves_inside_session_workspace(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + process_cwd = tmp_path / "process-cwd" + process_cwd.mkdir(parents=True) + prev_cwd = Path.cwd() + os.chdir(process_cwd) + try: + executor = build_executor( + fake_server, + { + "session_id": "code-session-3", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:write", + params={"file_path": "nested/output.txt", "content": "from workspace\n"}, + worker_id="worker-1", + trace_id="trace-1", + ) + finally: + os.chdir(prev_cwd) + + assert result["code"] == ErrorCode.SUCCESS + assert (runtime_workspace / "nested" / "output.txt").read_text(encoding="utf-8") == ( + "from workspace\n" + ) + assert not (process_cwd / "nested" / "output.txt").exists() + + +def test_code_read_error_prefix_is_returned_as_agentflow_error_response(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-4", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": "missing.txt"}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] != ErrorCode.SUCCESS + assert result["message"].startswith("Error:") + + +def test_tool_executor_rejects_missing_session_workspace_without_fallback(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + workspace_root = tmp_path / "agentflow_code" + workspace_root.mkdir(parents=True, exist_ok=True) + fallback_file = workspace_root / "fallback.txt" + 
fallback_file.write_text("must-not-read\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-missing-workspace", + "data": {}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": "fallback.txt"}, + worker_id="worker-1", + trace_id="trace-missing-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_malformed_session_workspace_without_fallback(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + executor = build_executor( + fake_server, + { + "session_id": "code-session-malformed-workspace", + "data": {"workspace": 123}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": "fallback.txt"}, + worker_id="worker-1", + trace_id="trace-malformed-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_nonexistent_session_workspace_under_workspace_root(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + nonexistent_workspace = tmp_path / "agentflow_code" / "worker-1" + executor = build_executor( + fake_server, + { + "session_id": "code-session-nonexistent-workspace", + "data": {"workspace": str(nonexistent_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": "demo.py"}, + worker_id="worker-1", + trace_id="trace-nonexistent-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_mismatched_session_workspace_under_workspace_root(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + mismatched_workspace = tmp_path / "agentflow_code" / "other-worker" + 
mismatched_workspace.mkdir(parents=True) + demo_file = mismatched_workspace / "demo.py" + demo_file.write_text("should-not-read\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-mismatched-workspace", + "data": {"workspace": str(mismatched_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-mismatched-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_session_workspace_outside_workspace_root(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + outside_workspace = tmp_path / "outside-workspace" + outside_workspace.mkdir(parents=True) + demo_file = outside_workspace / "demo.py" + demo_file.write_text("outside\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-outside-workspace", + "data": {"workspace": str(outside_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-outside-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_initialize_recreates_worker_workspace_without_stale_files(tmp_path): + _, backend = build_backend(tmp_path) + first_source = tmp_path / "source-first" + second_source = tmp_path / "source-second" + first_source.mkdir(parents=True) + second_source.mkdir(parents=True) + (first_source / "stale.py").write_text("print('old')\n", encoding="utf-8") + (second_source / "fresh.py").write_text("print('new')\n", encoding="utf-8") + + first_session = asyncio.run( + backend.initialize("runner_123", {"source_dir": str(first_source)}) + ) + second_session = asyncio.run( + backend.initialize("runner_123", 
{"source_dir": str(second_source)}) + ) + + assert first_session["workspace"] == second_session["workspace"] + workspace = Path(second_session["workspace"]) + assert not (workspace / "stale.py").exists() + assert (workspace / "fresh.py").exists() + + +def test_code_read_rejects_absolute_path_outside_workspace(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + outside_file = tmp_path / "outside.txt" + outside_file.write_text("secret\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-5", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(outside_file)}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] != ErrorCode.SUCCESS + + +def test_code_write_rejects_parent_escape_outside_workspace(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + escaped_file = tmp_path / "escaped.txt" + executor = build_executor( + fake_server, + { + "session_id": "code-session-6", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:write", + params={"file_path": "../escaped.txt", "content": "escaped\n"}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] != ErrorCode.SUCCESS + assert not escaped_file.exists() + + +def test_code_glob_rejects_parent_traversal_pattern(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-parent-traversal", + 
"data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "../*"}, + worker_id="worker-1", + trace_id="trace-glob-parent-traversal", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "pattern" in result["message"].lower() + + +def test_code_glob_rejects_embedded_parent_traversal_pattern(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + (runtime_workspace / "nested").mkdir(parents=True) + (runtime_workspace / "nested" / "demo.py").write_text( + "print('safe')\n", + encoding="utf-8", + ) + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-embedded-traversal", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "**/../*"}, + worker_id="worker-1", + trace_id="trace-glob-embedded-traversal", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "pattern" in result["message"].lower() + + +def test_code_glob_allows_safe_workspace_pattern(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + safe_file = runtime_workspace / "nested" / "demo.py" + safe_file.parent.mkdir(parents=True) + safe_file.write_text("print('ok')\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-safe", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "**/*.py"}, + worker_id="worker-1", + trace_id="trace-glob-safe", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"] == 
str(safe_file.resolve(strict=False)) + + +def test_initialize_rejects_hostile_worker_id_without_deleting_outside_dir(tmp_path): + _, backend = build_backend(tmp_path) + outside_dir = tmp_path / "escaped" + outside_dir.mkdir(parents=True) + marker = outside_dir / "keep.txt" + marker.write_text("do-not-delete\n", encoding="utf-8") + + with pytest.raises(ValueError): + asyncio.run(backend.initialize("../escaped", {})) + + assert marker.exists() + + +def test_initialize_rejects_nonexistent_source_dir(tmp_path): + _, backend = build_backend(tmp_path) + missing_source = tmp_path / "missing-source" + + with pytest.raises(ValueError, match="source_dir"): + asyncio.run(backend.initialize("runner_123", {"source_dir": str(missing_source)})) + + +def test_initialize_invalid_source_dir_leaves_no_workspace(tmp_path): + _, backend = build_backend(tmp_path) + missing_source = tmp_path / "missing-source" + workspace = tmp_path / "agentflow_code" / "runner_123" + + with pytest.raises(ValueError, match="source_dir"): + asyncio.run(backend.initialize("runner_123", {"source_dir": str(missing_source)})) + + assert not workspace.exists() + + +def test_cleanup_removes_worker_workspace(tmp_path): + _, backend = build_backend(tmp_path) + session = asyncio.run(backend.initialize("runner_123", {})) + workspace = Path(session["workspace"]) + + assert workspace.exists() + asyncio.run(backend.cleanup("runner_123", {"data": {"workspace": str(workspace)}})) + + assert not workspace.exists() + + +def test_cleanup_does_not_delete_workspace_outside_root(tmp_path): + _, backend = build_backend(tmp_path) + outside_workspace = tmp_path / "outside-workspace" + outside_workspace.mkdir(parents=True) + + asyncio.run( + backend.cleanup("runner_123", {"data": {"workspace": str(outside_workspace)}}) + ) + + assert outside_workspace.exists() + + +def test_cleanup_does_not_delete_nested_under_root_non_worker_path(tmp_path): + _, backend = build_backend(tmp_path) + nested_workspace = tmp_path / "agentflow_code" / 
"shared" / "cache" + nested_workspace.mkdir(parents=True) + + asyncio.run( + backend.cleanup("runner_123", {"data": {"workspace": str(nested_workspace)}}) + ) + + assert nested_workspace.exists() + + +def test_code_config_template_parses(): + loader = ConfigLoader() + config_path = ( + Path(__file__).resolve().parents[2] + / "configs" + / "sandbox-server" + / "code_config.json" + ) + + config = loader.load(str(config_path)) + + assert "code" in config.resources + assert ( + config.resources["code"].backend_class + == "sandbox.server.backends.resources.code.CodeBackend" + ) + assert config.server.session_ttl == 300 + assert ( + config.resources["code"].description + == "Lightweight coding backend powered by vendored internal tools" + ) + assert config.resources["code"].config == {"workspace_root": "/tmp/agentflow_code"} + assert config.warmup.enabled is False + assert config.warmup.resources == [] + + +def test_create_server_loads_code_backend_via_config_loader(tmp_path): + workspace_root = tmp_path / "agentflow_code" + remove_resources_modules() + loader = ConfigLoader() + loader.load_from_dict( + { + "server": { + "title": "Code backend smoke", + "session_ttl": 300, + }, + "resources": { + "code": { + "enabled": True, + "description": "Code backend", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "workspace_root": str(workspace_root), + }, + } + }, + "warmup": { + "enabled": False, + "resources": [], + }, + } + ) + + server = loader.create_server(host="127.0.0.1", port=0) + resources_package = sys.modules["sandbox.server.backends.resources"] + code_module = sys.modules["sandbox.server.backends.resources.code"] + + assert "code" in server._backends + assert "code:read" in server._tools + assert server._tool_resource_types["code:read"] == "code" + assert Path(resources_package.__file__).resolve() == (MODULE_PATH.parent / "__init__.py").resolve() + assert Path(code_module.__file__).resolve() == MODULE_PATH.resolve() diff 
--git a/sandbox/tests/test_code_tool_schemas.py b/sandbox/tests/test_code_tool_schemas.py new file mode 100644 index 0000000..6247ee0 --- /dev/null +++ b/sandbox/tests/test_code_tool_schemas.py @@ -0,0 +1,119 @@ +""" +Tests for code tool schemas integration. +""" + +from sandbox.tool_schemas import get_all_tool_names, get_tool_schemas, get_tools_by_resource + + +EXPECTED_CODE_TOOLS = { + "code-read", + "code-glob", + "code-grep", + "code-bash", + "code-edit", + "code-write", +} + + +def _code_schemas_by_name(): + schemas = get_tools_by_resource("code") + return {schema["name"]: schema for schema in schemas} + + +def test_code_tools_visible_in_global_catalog(): + """Code tools should appear in the global tool name catalog.""" + names = get_all_tool_names() + + assert "code-read" in names + assert "code-bash" in names + + +def test_code_wildcard_filtering(): + """Wildcard filtering should support code-* patterns.""" + schemas = get_tool_schemas(["code-*"]) + names = {schema["name"] for schema in schemas} + + assert names == EXPECTED_CODE_TOOLS + + +def test_get_tools_by_resource_code(): + """Resource filtering should return all code tools.""" + schemas = get_tools_by_resource("code") + names = {schema["name"] for schema in schemas} + + assert names == EXPECTED_CODE_TOOLS + + +def test_code_tool_parameter_contract(): + """Each code tool should expose the exact expected parameter contracts.""" + expected_params = { + "code-read": { + ("file_path", "string", True), + ("offset", "integer", False), + ("limit", "integer", False), + }, + "code-glob": { + ("pattern", "string", True), + ("path", "string", False), + }, + "code-grep": { + ("pattern", "string", True), + ("path", "string", False), + ("glob", "string", False), + }, + "code-bash": { + ("command", "string", True), + }, + "code-edit": { + ("file_path", "string", True), + ("old_string", "string", True), + ("new_string", "string", True), + ("replace_all", "boolean", False), + }, + "code-write": { + ("file_path", 
"string", True), + ("content", "string", True), + }, + } + schemas = _code_schemas_by_name() + + for tool_name, expected in expected_params.items(): + actual = { + (param["name"], param["type"], param["required"]) + for param in schemas[tool_name]["parameters"] + } + assert actual == expected + + +def test_code_read_description_mentions_line_numbered_and_1_indexed_offset(): + """code-read docs should preserve line-numbered output and 1-indexed offset semantics.""" + schema = _code_schemas_by_name()["code-read"] + offset = next( + param for param in schema["parameters"] if param["name"] == "offset" + ) + + assert "line" in schema["description"].lower() + assert "number" in schema["description"].lower() + assert "1-indexed" in offset["description"].lower() + + +def test_code_bash_description_mentions_workspace_shell_execution(): + """code-bash docs should describe shell execution in the coding workspace.""" + schema = _code_schemas_by_name()["code-bash"] + description = schema["description"].lower() + + assert "workspace" in description + assert "shell command" in description + assert "backend config" not in description + + +def test_code_write_description_mentions_workspace_full_content_and_parent_dirs(): + """code-write docs should mention writing full content and creating parent directories.""" + schema = _code_schemas_by_name()["code-write"] + description = schema["description"].lower() + + assert "workspace" in description + assert "full file content" in description + assert "parent" in description + assert "director" in description + assert "create" in description diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py new file mode 100644 index 0000000..4bd5893 --- /dev/null +++ b/sandbox/tests/test_code_vendor_tools.py @@ -0,0 +1,326 @@ +import asyncio +import importlib.util +import shlex +import sys +import time +from types import SimpleNamespace +from pathlib import Path + +import pytest + +PACKAGE_DIR = ( + 
Path(__file__).resolve().parents[1] + / "server" + / "backends" + / "resources" + / "code_vendor" +) + + +def load_code_vendor_module(module_name): + package_name = "_test_code_vendor" + package_spec = importlib.util.spec_from_file_location( + package_name, + PACKAGE_DIR / "__init__.py", + submodule_search_locations=[str(PACKAGE_DIR)], + ) + assert package_spec is not None + assert package_spec.loader is not None + package = importlib.util.module_from_spec(package_spec) + sys.modules[package_name] = package + package_spec.loader.exec_module(package) + + module_spec = importlib.util.spec_from_file_location( + f"{package_name}.{module_name}", + PACKAGE_DIR / f"{module_name}.py", + ) + assert module_spec is not None + assert module_spec.loader is not None + module = importlib.util.module_from_spec(module_spec) + sys.modules[f"{package_name}.{module_name}"] = module + module_spec.loader.exec_module(module) + return module + + +file_tools = load_code_vendor_module("file_tools") +edit_tools = load_code_vendor_module("edit_tools") +tool_module = load_code_vendor_module("tool") + +ReadTool = file_tools.ReadTool +GlobTool = file_tools.GlobTool +GrepTool = file_tools.GrepTool +BashTool = file_tools.BashTool +EditTool = edit_tools.EditTool +WriteTool = edit_tools.WriteTool + + +def make_ctx(tmp_path): + return SimpleNamespace(cwd=str(tmp_path)) + + +def call_tool(tool, args, ctx): + return asyncio.run(tool.call(args, ctx)) + + +def test_read_tool_returns_line_numbered_content(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\ngamma\n", encoding="utf-8") + + result = call_tool(ReadTool(), {"file_path": str(target)}, make_ctx(tmp_path)) + + assert result == " 1→alpha\n 2→beta\n 3→gamma" + + +def test_read_tool_honors_offset_and_limit(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\ngamma\ndelta\n", encoding="utf-8") + + result = call_tool( + ReadTool(), + {"file_path": str(target), "offset": 1, "limit": 2}, +
make_ctx(tmp_path), + ) + + assert result == " 1→alpha\n 2→beta" + + +def test_edit_tool_requires_unique_match_by_default(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\nbeta\n", encoding="utf-8") + + result = call_tool( + EditTool(), + {"file_path": str(target), "old_string": "beta", "new_string": "BETA"}, + make_ctx(tmp_path), + ) + + assert "appears 2 times" in result + assert "replace_all=true" in result + assert target.read_text(encoding="utf-8") == "alpha\nbeta\nbeta\n" + + +def test_edit_tool_replace_all_updates_each_match(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\nbeta\n", encoding="utf-8") + + result = call_tool( + EditTool(), + { + "file_path": str(target), + "old_string": "beta", + "new_string": "BETA", + "replace_all": True, + }, + make_ctx(tmp_path), + ) + + assert result == f"Replaced 2 occurrence(s) in {target}" + assert target.read_text(encoding="utf-8") == "alpha\nBETA\nBETA\n" + + +def test_write_tool_creates_parent_directories_and_overwrites_full_file(tmp_path): + target = tmp_path / "nested" / "dir" / "sample.txt" + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("stale content that should disappear\n", encoding="utf-8") + + result = call_tool( + WriteTool(), + {"file_path": str(target), "content": "hello\nworld\n"}, + make_ctx(tmp_path), + ) + + assert result == f"Wrote 12 bytes (2 lines) to {target}" + assert target.read_text(encoding="utf-8") == "hello\nworld\n" + + +def test_glob_tool_returns_sorted_matches(tmp_path): + (tmp_path / "a.py").write_text("print('a')\n", encoding="utf-8") + pkg = tmp_path / "pkg" + pkg.mkdir() + (pkg / "b.py").write_text("print('b')\n", encoding="utf-8") + (pkg / "c.txt").write_text("ignore\n", encoding="utf-8") + + result = call_tool( + GlobTool(), + {"pattern": "**/*.py", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == f"{tmp_path / 'a.py'}\n{tmp_path / 'pkg' / 'b.py'}" + + +def 
test_grep_tool_returns_matches_with_line_numbers_for_filtered_files(tmp_path): + first = tmp_path / "first.txt" + second = tmp_path / "second.txt" + first.write_text("alpha\nbeta\n", encoding="utf-8") + second.write_text("beta\ngamma\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "beta", "path": str(tmp_path), "glob": "*.txt"}, + make_ctx(tmp_path), + ) + + assert result.endswith("\n") + assert set(result.splitlines()) == { + f"{first}:2:beta", + f"{second}:1:beta", + } + + +def test_grep_tool_searches_recursively_without_glob_filter(tmp_path): + root_match = tmp_path / "root.txt" + nested_dir = tmp_path / "pkg" / "nested" + nested_dir.mkdir(parents=True) + nested_match = nested_dir / "deep.py" + root_match.write_text("needle at root\n", encoding="utf-8") + nested_match.write_text("first line\nneedle in nested file\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "needle", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result.endswith("\n") + assert set(result.splitlines()) == { + f"{nested_match}:2:needle in nested file", + f"{root_match}:1:needle at root", + } + + +def test_grep_tool_returns_no_matches_for_exit_code_one(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "missing", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == "(no matches)" + + +def test_grep_tool_returns_error_prefix_for_invalid_pattern(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "[", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result.startswith("Error:") + assert "exit status 2" in result + assert "[stderr]:" in result + + +def test_grep_tool_treats_option_like_pattern_as_search_pattern(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("--help\nalpha\n", 
encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "--help", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == f"{target}:1:--help\n" + + +def test_bash_tool_combines_stdout_and_stderr(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + "python -c \"import sys; " + "print('out'); " + "print('err', file=sys.stderr)\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result == "out\n\n[stderr]:\nerr" + + +def test_bash_tool_matches_text_mode_newline_normalization(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; sys.stdout.buffer.write(b'a\\r\\nb\\r\\n')\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result == "a\nb" + + +def test_bash_tool_returns_error_prefix_for_nonzero_exit_status(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; " + "print('out'); " + "print('err', file=sys.stderr); " + "raise SystemExit(7)\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result.startswith("Error:") + assert "exit status 7" in result + assert "out" in result + assert "[stderr]:" in result + assert "err" in result + + +def test_bash_tool_cancellation_stops_background_command(tmp_path): + marker = tmp_path / "marker.txt" + + async def run_bash_with_timeout(): + timeout_start = time.monotonic() + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + BashTool().call( + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import pathlib, time; " + "time.sleep(0.3); " + "pathlib.Path('marker.txt').write_text('created', encoding='utf-8')\"" + ) + }, + make_ctx(tmp_path), + ), + timeout=0.1, + ) + timeout_elapsed = time.monotonic() - timeout_start + await asyncio.sleep(0.4) + return timeout_elapsed + + timeout_elapsed = asyncio.run(run_bash_with_timeout()) + + assert timeout_elapsed < 0.25 + assert not marker.exists() + + +def 
test_tool_api_format_and_read_only_flags(): + read_tool = ReadTool() + bash_tool = BashTool() + + api_format = read_tool.to_api_format() + + assert api_format["name"] == "Read" + assert isinstance(api_format["description"], str) + assert api_format["input_schema"] == read_tool.input_schema + assert read_tool.is_read_only({}) is True + assert bash_tool.is_read_only({}) is False diff --git a/sandbox/tests/test_sandbox_config_loading.py b/sandbox/tests/test_sandbox_config_loading.py index 16773a5..d57f5cb 100644 --- a/sandbox/tests/test_sandbox_config_loading.py +++ b/sandbox/tests/test_sandbox_config_loading.py @@ -27,3 +27,24 @@ def test_load_server_config_expands_env_default_placeholders(tmp_path, monkeypat loaded["resources"]["mcp"]["config"]["workspace_root"] == "/tmp/agentflow_mcp" ) + + +def test_load_server_config_keeps_workspace_root_for_code_backend(tmp_path): + config_path = tmp_path / "code_config.json" + raw_config = { + "resources": { + "code": { + "enabled": True, + "config": { + "workspace_root": "/tmp/agentflow_code" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + + loaded = sandbox._load_server_config() + + assert loaded["resources"]["code"]["config"]["workspace_root"] == "/tmp/agentflow_code" diff --git a/sandbox/tool_schemas/__init__.py b/sandbox/tool_schemas/__init__.py index 9ba5139..36a1d16 100644 --- a/sandbox/tool_schemas/__init__.py +++ b/sandbox/tool_schemas/__init__.py @@ -13,6 +13,7 @@ from .doc_tools import get_doc_tool_schemas from .ds_tools import get_ds_tool_schemas from .sql_tools import get_sql_tool_schemas +from .code_tools import get_code_tool_schemas from .mcp import get_mcp_tool_schemas @@ -57,6 +58,7 @@ def get_tool_schemas(allowed_tools: Optional[List[str]] = None) -> List[Dict[str + get_doc_tool_schemas() + get_ds_tool_schemas() + get_sql_tool_schemas() + + get_code_tool_schemas() ) # MCP manifest (438 tools) 
is expensive to load into every prompt. @@ -118,7 +120,7 @@ def get_tools_by_resource(resource_type: str) -> List[Dict[str, Any]]: Get tools for a specific resource type. Args: - resource_type: Resource type like "vm", "rag", "web", "mcp" + resource_type: Resource type like "vm", "rag", "web", "mcp", "code" Returns: List of tool schemas for that resource @@ -144,5 +146,6 @@ def get_tools_by_resource(resource_type: str) -> List[Dict[str, Any]]: "get_doc_tool_schemas", "get_ds_tool_schemas", "get_sql_tool_schemas", + "get_code_tool_schemas", "get_mcp_tool_schemas", ] diff --git a/sandbox/tool_schemas/code_tools.py b/sandbox/tool_schemas/code_tools.py new file mode 100644 index 0000000..f281b14 --- /dev/null +++ b/sandbox/tool_schemas/code_tools.py @@ -0,0 +1,139 @@ +""" +Code Tool Schemas + +This module defines tool schemas for code workspace operations. +""" + +from typing import List, Dict, Any + + +def get_code_tool_schemas() -> List[Dict[str, Any]]: + """Get all code tool schemas.""" + return [ + { + "name": "code-read", + "description": "Read a text file from the current code workspace and return contents with line numbers.", + "parameters": [ + { + "name": "file_path", + "type": "string", + "description": "Path to the file to read.", + "required": True, + }, + { + "name": "offset", + "type": "integer", + "description": "Optional start line number for partial reads (1-indexed).", + "required": False, + }, + { + "name": "limit", + "type": "integer", + "description": "Optional maximum number of lines to return.", + "required": False, + }, + ], + }, + { + "name": "code-glob", + "description": "Find files in the coding workspace using a glob pattern.", + "parameters": [ + { + "name": "pattern", + "type": "string", + "description": "Glob pattern to match files, such as '**/*.py'.", + "required": True, + }, + { + "name": "path", + "type": "string", + "description": "Optional base directory to search from.", + "required": False, + }, + ], + }, + { + "name": 
"code-grep", + "description": "Search file contents in the coding workspace with a regex pattern.", + "parameters": [ + { + "name": "pattern", + "type": "string", + "description": "Regex pattern to search for.", + "required": True, + }, + { + "name": "path", + "type": "string", + "description": "Optional directory path to scope the search.", + "required": False, + }, + { + "name": "glob", + "type": "string", + "description": "Optional glob filter for file selection, such as '*.ts'.", + "required": False, + }, + ], + }, + { + "name": "code-bash", + "description": "Run a shell command in the current coding workspace.", + "parameters": [ + { + "name": "command", + "type": "string", + "description": "Shell command to execute.", + "required": True, + }, + ], + }, + { + "name": "code-edit", + "description": "Edit a file in the coding workspace by exact string replacement, expecting a unique match unless replace_all=true.", + "parameters": [ + { + "name": "file_path", + "type": "string", + "description": "Path to the file to edit.", + "required": True, + }, + { + "name": "old_string", + "type": "string", + "description": "Exact text to find in the file.", + "required": True, + }, + { + "name": "new_string", + "type": "string", + "description": "Text used to replace the matched string.", + "required": True, + }, + { + "name": "replace_all", + "type": "boolean", + "description": "When true, replace all exact matches; otherwise exactly one unique match is expected.", + "required": False, + }, + ], + }, + { + "name": "code-write", + "description": "Write full file content to a file in the coding workspace and create parent directories if needed.", + "parameters": [ + { + "name": "file_path", + "type": "string", + "description": "Path to the file to write.", + "required": True, + }, + { + "name": "content", + "type": "string", + "description": "Complete file content to write.", + "required": True, + }, + ], + }, + ]