diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..68ff88b --- /dev/null +++ b/.env.example @@ -0,0 +1,27 @@ +# openstudio-mcp environment variable template. +# Copy to .env and fill in your values — .env is gitignored and must never be committed. + +# --------------------------------------------------------------------------- +# Telemetry (optional — requires pip install 'openstudio-mcp[telemetry]') +# Leave TRACELOOP_BASE_URL unset to disable tracing entirely (zero overhead). +# --------------------------------------------------------------------------- + +# OTLP HTTP endpoint. Examples: +# Local Jaeger: http://localhost:4318 +# Traceloop cloud: https://api.traceloop.com +TRACELOOP_BASE_URL= + +# API key — required only for Traceloop cloud, not for generic OTLP backends. +TRACELOOP_API_KEY= + +# Service name shown on every span. +OTEL_SERVICE_NAME=openstudio-mcp + +# Set to "false" to use synchronous span export (useful in development). +OTEL_EXPORT_BATCH=true + +# IMPORTANT PRIVACY SETTING: when "true" (default), tool arguments and outputs +# — including file paths, model parameters, and simulation results — are +# exported to the OTLP backend. Set to "false" unless you have reviewed the +# data being exported and your backend is self-hosted or trusted. +TRACELOOP_TRACE_CONTENT=false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba4cf19..0fee907 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,15 @@ jobs: mkdir -p runs docker run --rm -v "$PWD:/repo" -v "$PWD/runs:/runs" openstudio-mcp:dev bash -lc 'cd /repo && pytest -vv -m "not integration"' + - name: Smoke-install telemetry extra + # Validates that traceloop-sdk and its deps install cleanly from the + # pinned constraint in [telemetry]. Catches packaging drift that would + # make openstudio-mcp[telemetry] uninstallable without needing a full + # Docker rebuild. 
+ run: | + docker run --rm -v "$PWD:/repo" openstudio-mcp:dev bash -lc \ + 'pip install --quiet -e "/repo[telemetry]" && python -c "from traceloop.sdk import Traceloop; print(\"traceloop-sdk OK\")"' + - name: Save Docker image run: docker save openstudio-mcp:dev | gzip > /tmp/image.tar.gz @@ -66,8 +75,8 @@ jobs: EXTRA_ENV="-e MCP_OSW_PATH=tests/assets/SEB_model/SEB4_baseboard/workflow.osw -e EXPECTED_EUI=1.8750760248144998 -e EXPECTED_EUI_RTOL=0.02 -e EXPECTED_EUI_ATOL=0.0" ;; 2) - # common_measures, hvac_systems, geometry, zone terminal, skill_energy_report - FILES="tests/test_common_measures.py tests/test_hvac_systems.py tests/test_replace_zone_terminal.py tests/test_geometry.py tests/test_skill_energy_report.py" + # common_measures, hvac_systems, geometry, zone terminal, skill_energy_report, ecm_package + FILES="tests/test_common_measures.py tests/test_hvac_systems.py tests/test_replace_zone_terminal.py tests/test_geometry.py tests/test_skill_energy_report.py tests/test_skill_ecm_package.py" EXTRA_ENV="" ;; 3) @@ -82,7 +91,7 @@ jobs: ;; 5) # HVAC supply sim smoke tests + hvac_validation + bar_building + concurrent regression - FILES="tests/test_hvac_supply_sim.py tests/test_hvac_validation.py tests/test_bar_building.py tests/test_concurrent_tools.py tests/test_stdout_logger_silence.py" + FILES="tests/test_hvac_supply_sim.py tests/test_hvac_validation.py tests/test_bar_building.py tests/test_concurrent_tools.py tests/test_stdout_logger_silence.py tests/test_telemetry.py" EXTRA_ENV="" ;; esac diff --git a/.gitignore b/.gitignore index 8396c12..7c70477 100644 --- a/.gitignore +++ b/.gitignore @@ -33,5 +33,11 @@ Thumbs.db # Code review artifacts docs/review/ +# Environment / secrets +.env +.env.* +!.env.example + # Codex CLI .codex/ +.mcp.json diff --git a/.mcp.json.example b/.mcp.json.example new file mode 100644 index 0000000..d69202c --- /dev/null +++ b/.mcp.json.example @@ -0,0 +1,15 @@ +{ + "mcpServers": { + "openstudio-mcp": { + "command": "docker", + "args": 
[ + "run", "--rm", "-i", + "-v", "/ABSOLUTE/PATH/TO/inputs:/inputs", + "-v", "/ABSOLUTE/PATH/TO/runs:/runs", + "-v", "/ABSOLUTE/PATH/TO/openstudio-mcp/.claude/skills:/skills:ro", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ] + } + } +} diff --git a/CHANGELOG.md b/CHANGELOG.md index 301c256..888399c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## [Unreleased] + +### Added +- **Optional OpenLLMetry tracing**: `pip install 'openstudio-mcp[telemetry]'` + `TRACELOOP_BASE_URL` env var enables distributed tracing via traceloop-sdk. Zero overhead when unset. Key operations (`run_simulation`, `apply_measure`, `create_measure`, `create_*_building`, `run_qaqc_checks`) emit named spans; every FastMCP tool call is auto-instrumented via `McpInstrumentor`. +- **Per-client setup guides**: `docs/clients/` — detailed MCP config examples, tool limits, and performance notes for Claude Code, Claude Desktop, VS Code Copilot, Windsurf, Gemini CLI, and Cursor. +- **Token context performance doc**: `docs/clients/token-context-performance.md` — benchmark of how each client handles the 142-tool surface and context overhead. +- **SECURITY.md**: disclosure policy and supported versions. +- **ECM package example**: `docs/examples/20_deep_retrofit_package.md` — wall insulation + thermostat + window + PV stack with expected EUI ranges. +- **`.mcp.json.example`**: ready-to-use Claude Code MCP config. +- **Docker tracing stack**: `docker/docker-compose.tracing.yml` + `docker/otel-collector-config.yaml` for local Jaeger/OTEL collector. +- **`test_telemetry.py`**: 20 unit tests for telemetry module (no Docker required) — includes startup-wiring and decorator-coverage regression tests. +- **`.env.example`**: template for telemetry environment variables with privacy guidance. +- **`test_stdout_logger_silence.py`**: integration tests verifying Polyhedron/Space Logger warnings are fully suppressed after `silence_openstudio_stdout_logger()`. 
+ +### Fixed +- **ECM package example**: window ECM was incorrectly using `create_standard_opaque_material`; now correctly notes that glazing requires `SimpleGlazing` authored via `create_measure`. +- **README tracing Docker example**: corrected image tag from `openstudio-mcp:dev` to `openstudio-mcp:tracing` (the dev image does not include traceloop-sdk); added build command and explanatory note. +- **`TRACELOOP_TRACE_CONTENT` docs**: expanded to warn that the default (`true`) exports tool arguments and outputs to the OTLP backend; recommends `false` as the safe starting point. +- **`opentelemetry-sdk` version constraint**: tightened `[dev]` lower bound from `>=1.20` to `>=1.38.0` to match `traceloop-sdk`'s actual minimum; prevents pip resolving an incompatible version when both extras are installed. + ## [0.9.0] - 2026-04-10 ### Added diff --git a/README.md b/README.md index 05e406c..ad48b70 100644 --- a/README.md +++ b/README.md @@ -106,19 +106,21 @@ For simulation outputs (results, SQL, HTML reports), these are already in `/runs ### Other MCP Hosts -[VS Code Copilot](https://code.visualstudio.com/), [Claude Code](https://docs.anthropic.com/en/docs/claude-code), [Windsurf](https://windsurf.com/), and [Gemini CLI](https://github.com/google-gemini/gemini-cli) also support MCP with similar JSON config. See the [MCP documentation](https://modelcontextprotocol.io/quickstart/user) for host-specific setup. +See **[`docs/clients/`](docs/clients/index.md)** for per-client setup guides with config files, tool limits, and performance notes. 
### Client Compatibility -| Client | Status | Notes | -|--------|--------|-------| -| Claude Desktop | Full support | All 142 tools available | -| Claude Code | Full support | ToolSearch auto-defers tools for efficient discovery | -| VS Code Copilot | Compatible | MCP support via config | -| Windsurf | Compatible | Under 100-tool limit | -| Gemini CLI | Compatible | Use includeTools/excludeTools if needed | -| Cursor | Not compatible | 40-tool hard cap — use Windsurf or Claude Code instead | -| OpenAI API | Compatible | Use defer_loading for best results | +| Client | Tool Limit | Status | Guide | +|--------|-----------|--------|-------| +| Claude Code | Unlimited (ToolSearch) | ✅ Best | [claude-code.md](docs/clients/claude-code.md) | +| Claude Desktop | ~100 practical | ✅ Full | [claude-desktop.md](docs/clients/claude-desktop.md) | +| VS Code Copilot | 128 hard | ✅ Full | [vs-code-copilot.md](docs/clients/vs-code-copilot.md) | +| Windsurf | 100 hard | ⚠️ Partial | [windsurf.md](docs/clients/windsurf.md) — manual tool selection required | +| Gemini CLI | 100 soft / 512 API | ⚠️ Partial | [gemini-cli.md](docs/clients/gemini-cli.md) — use `includeTools` | +| Cursor | 40 hard | ❌ Incompatible | [cursor.md](docs/clients/cursor.md) — 40-tool cap | +| OpenAI API | 128 (recommends ~10) | ✅ Compatible | Use `defer_loading` for best results | + +See [token context & performance](docs/clients/token-context-performance.md) for a breakdown of how each client handles the 142-tool surface. --- @@ -532,6 +534,40 @@ In **prod mode**, stdout is reserved exclusively for MCP JSON-RPC messages. Logs --- +## Tracing (OpenLLMetry) + +Distributed tracing via [traceloop-sdk](https://github.com/traceloop/openllmetry) is available as an optional extra. 
Install it, then set `TRACELOOP_BASE_URL` to enable it: + +```bash +pip install 'openstudio-mcp[telemetry]' +``` + +Or with Docker (requires the tracing image built with `--build-arg TELEMETRY=1`): + +```bash +# Build the tracing-enabled image once: +docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile . + +# Run with tracing enabled: +docker run --rm -i \ + -e TRACELOOP_BASE_URL=http://host.docker.internal:4318 \ + openstudio-mcp:tracing openstudio-mcp +``` + +The standard `openstudio-mcp:dev` image does **not** include `traceloop-sdk`. Using it with `TRACELOOP_BASE_URL` set will log a warning and disable tracing — it will not fail silently. + +| Variable | Default | Description | +|----------|---------|-------------| +| `TRACELOOP_BASE_URL` | *(unset — disabled)* | OTLP endpoint, e.g. `http://localhost:4318` or `https://api.traceloop.com` | +| `TRACELOOP_API_KEY` | *(unset)* | API key for Traceloop cloud (not needed for generic OTLP) | +| `OTEL_SERVICE_NAME` | `openstudio-mcp` | Service name on every span | +| `OTEL_EXPORT_BATCH` | `true` | Set `false` for synchronous export in development | +| `TRACELOOP_TRACE_CONTENT` | `true` | **Set `false` to protect privacy** — when `true`, tool arguments and outputs (including file paths and model data) are exported to the OTLP backend. Recommended: start with `false` and enable only if your backend is self-hosted or you have reviewed the data. | + +Tracing is **off by default** and has zero overhead when `TRACELOOP_BASE_URL` is unset. Key operations (`run_simulation`, `apply_measure`, `create_measure`, the three `create_*_building` variants, and `run_qaqc_checks`) emit named spans. Every FastMCP tool call is auto-instrumented via `McpInstrumentor`. 
+ +--- + ## Architecture - **Transport:** stdio (container spawned by host) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..3c95ef7 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,108 @@ +# Security Policy + +## Scope + +This document covers the `openstudio-mcp` MCP server — a container-bound process that gives +AI agents programmatic control of building energy models via the OpenStudio SDK. + +--- + +## Path Safety + +### Allowed Roots + +All file operations (`read_file`, `copy_file`, `run_osw`, `validate_osw`, etc.) are restricted +to a fixed set of allowed path roots enforced by `is_path_allowed()` in `mcp_server/config.py`: + +| Root (default) | Contents | Env Override | +|---|---|---| +| `/runs` | Run outputs, simulation artifacts | `OPENSTUDIO_MCP_RUN_ROOT` | +| `/inputs` | User-provided models and weather files | `OPENSTUDIO_MCP_INPUT_ROOT` | +| `/repo` | Server source code (read-only use cases) | — | +| Bundled measures dirs | ComStock and common measures | — | +| Skills dir | Skill Markdown guides | — | + +Any path that resolves (after symlink expansion) outside these roots is rejected with +`{"ok": false, "error": "invalid_path"}`. Symlink traversal is prevented by calling +`Path.resolve()` before comparison. + +### Path Traversal Mitigations + +| Attack Vector | Mitigation | +|---|---| +| `../../etc/passwd` in `file_path` | `Path.resolve()` + allowlist check | +| `../../etc` in `copy_file` `destination` | Same: both source and destination validated | +| `../model.osm` in `seed_file` (OSW) | Flattened to `basename` before staging into run dir | +| Symlink escape from `/runs` | `resolve()` follows symlinks before allowlist check | + +### What Is Not Protected + +- **Denial of service** via large file reads: `read_file` defaults to 50 KB (`max_bytes=50_000`) + but callers can override `max_bytes`. No upper-bound cap is enforced — consider adding one + if exposing this server to untrusted clients. 
+- **EnergyPlus subprocess**: The simulation runner (`run_simulation`, `run_osw`) invokes + `openstudio run` as a subprocess. The OSM/OSW content is caller-controlled; a malicious model + could cause unexpected EnergyPlus behavior. The container boundary is the primary mitigation. + +--- + +## Container Isolation + +The server is designed to run inside a Docker container with explicit volume mounts: + +``` +docker run --rm \ + -v "/path/to/models:/inputs" \ + -v "/path/to/outputs:/runs" \ + openstudio-mcp:latest +``` + +- The host filesystem is **not mounted** except for the two explicit volumes. +- By default, the server performs no outbound network calls; OpenStudio/EnergyPlus + are fully offline. **Exception:** when `TRACELOOP_BASE_URL` is set, the server + exports traces to that OTLP endpoint. +- The server process runs as the user defined by the container runtime. The repo + Dockerfile does not set a `USER` instruction — it runs as root by default. + Production deployments should add a non-root `USER` in a derived image. + +--- + +## Stdout / MCP Transport Integrity + +OpenStudio's SWIG bindings emit log warnings to C stdout. This would corrupt the JSON-RPC +transport (MCP communicates over stdio). Two mitigations are applied at startup in `server.py`: + +1. `silence_openstudio_stdout_logger()` — sets OpenStudio's standard-out logger to `Fatal` + level, suppressing operational warnings. +2. `redirect_c_stdout_to_stderr()` — permanently redirects C-level stdout (fd 1) to stderr, + with Python's `sys.stdout` on a private pipe to the MCP client. This is a backstop for + any C-extension output that bypasses the logger. + +These mitigations prevent log injection into the MCP JSON-RPC stream. + +--- + +## Reporting a Vulnerability + +Please **do not** open a public GitHub issue for security vulnerabilities. + +Email the maintainers directly or use GitHub's +[private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature (**Security → Report a vulnerability** on this repository). 
Include: + +- A description of the vulnerability and its impact +- Steps to reproduce (minimal repro preferred) +- Affected versions or commits + +We aim to acknowledge reports within 72 hours and provide a fix or mitigation within 14 days +for confirmed issues. + +--- + +## Known Limitations / Out of Scope + +- **Authentication / authorization**: The MCP server has no built-in auth. Access control is + the responsibility of the MCP client and the host environment. +- **EnergyPlus model content**: The server executes whatever EnergyPlus model the caller + provides. Malicious model content is an EnergyPlus concern, not an MCP server concern. +- **Multi-tenancy**: The server holds a single shared in-memory model. It is not designed for + simultaneous untrusted multi-user access. diff --git a/docker/Dockerfile b/docker/Dockerfile index 1a4ae9d..3e600f1 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -43,8 +43,13 @@ COPY pyproject.toml /repo/pyproject.toml COPY mcp_server /repo/mcp_server COPY docker /repo/docker +# TELEMETRY=1 installs traceloop-sdk + opentelemetry instrumentation for MCP. +# Default off to keep the image lean. Set to 1 for the tracing variant: +# docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile . +ARG TELEMETRY=0 RUN pip install --no-cache-dir -U pip \ - && pip install --no-cache-dir -e ".[dev]" + && pip install --no-cache-dir -e ".[dev]" \ + && if [ "$TELEMETRY" = "1" ]; then pip install --no-cache-dir -e ".[telemetry]"; fi # (Optional) If you want the container to include any other repo files too: # COPY . /repo diff --git a/docker/docker-compose.tracing.yml b/docker/docker-compose.tracing.yml new file mode 100644 index 0000000..d75f7e2 --- /dev/null +++ b/docker/docker-compose.tracing.yml @@ -0,0 +1,91 @@ +# OpenLLMetry / Traceloop tracing stack for openstudio-mcp +# +# Quick start: +# # 1. 
Build the tracing-enabled MCP image (adds traceloop-sdk): +# docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile . +# +# # 2. Start Jaeger: +# docker compose -f docker/docker-compose.tracing.yml up -d +# open http://localhost:16686 # Jaeger UI — traces appear here +# +# # 3. Configure your MCP client to use the tracing image on the shared network. +# Add these flags to your client's docker run command: +# +# -e TRACELOOP_BASE_URL=http://jaeger:4318 +# --network openstudio-mcp-tracing +# (and use openstudio-mcp:tracing instead of openstudio-mcp:dev) +# +# Example — Claude Code .mcp.json with tracing enabled: +# +# { +# "mcpServers": { +# "openstudio-mcp": { +# "command": "docker", +# "args": [ +# "run", "--rm", "-i", +# "-v", "/abs/path/inputs:/inputs", +# "-v", "/abs/path/runs:/runs", +# "--network", "openstudio-mcp-tracing", +# "-e", "OPENSTUDIO_MCP_MODE=prod", +# "-e", "TRACELOOP_BASE_URL=http://jaeger:4318", +# "-e", "OTEL_SERVICE_NAME=openstudio-mcp", +# "-e", "TRACELOOP_TRACE_CONTENT=true", +# "openstudio-mcp:tracing", "openstudio-mcp" +# ] +# } +# } +# } +# +# Environment variables understood by the MCP server (see mcp_server/telemetry.py): +# TRACELOOP_BASE_URL OTLP HTTP endpoint (required to enable telemetry) +# TRACELOOP_API_KEY API key for Traceloop cloud (omit for local Jaeger) +# OTEL_SERVICE_NAME Service name on spans (default: openstudio-mcp) +# OTEL_EXPORT_BATCH "false" for sync export in dev (default: batch) +# TRACELOOP_TRACE_CONTENT "false" to omit tool args from spans (privacy) + +services: + jaeger: + image: jaegertracing/jaeger:2.5.0 + container_name: openstudio-mcp-jaeger + ports: + - "16686:16686" # Jaeger UI + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver (used by traceloop-sdk) + environment: + - SPAN_STORAGE_TYPE=memory + networks: + - tracing + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://localhost:16686/"] + interval: 10s + timeout: 5s + 
retries: 5 + start_period: 10s + + # Optional: OpenTelemetry Collector as a middle layer. + # Useful if you want to fan-out to multiple backends (Jaeger + Prometheus + Loki). + # Not started by default (it is behind the "collector" profile); without it, point TRACELOOP_BASE_URL directly to jaeger:4318 for the simplest setup. + otel-collector: + image: otel/opentelemetry-collector-contrib:0.120.0 + container_name: openstudio-mcp-otelcol + command: ["--config=/etc/otel/config.yaml"] + volumes: + - ./otel-collector-config.yaml:/etc/otel/config.yaml:ro + ports: + - "4319:4318" # OTLP HTTP (external port offset to avoid conflict with jaeger) + - "4320:4317" # OTLP gRPC (external port offset) + - "8888:8888" # Collector metrics (Prometheus scrape endpoint) + networks: + - tracing + depends_on: + jaeger: + condition: service_healthy + restart: unless-stopped + profiles: + - collector # only start with: docker compose --profile collector up + +networks: + tracing: + name: openstudio-mcp-tracing + driver: bridge diff --git a/docker/otel-collector-config.yaml b/docker/otel-collector-config.yaml new file mode 100644 index 0000000..4520179 --- /dev/null +++ b/docker/otel-collector-config.yaml @@ -0,0 +1,59 @@ +# OpenTelemetry Collector config for openstudio-mcp tracing stack. +# Used only when running: docker compose --profile collector up +# For simple setups, skip this and point TRACELOOP_BASE_URL directly to jaeger:4318. 
+ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + batch: + timeout: 1s + send_batch_size: 1024 + memory_limiter: + check_interval: 1s + limit_mib: 256 + spike_limit_mib: 64 + # Add service name tag from resource attributes + resource: + attributes: + - key: service.namespace + value: openstudio-mcp + action: insert + +exporters: + otlp/jaeger: + endpoint: jaeger:4317 + tls: + insecure: true + + # Uncomment to export to Traceloop cloud instead of / in addition to Jaeger: + # otlphttp/traceloop: + # endpoint: https://api.traceloop.com + # headers: + # Authorization: "Bearer ${TRACELOOP_API_KEY}" + + # Prometheus metrics from the collector itself (scrape at :8888/metrics) + prometheus: + endpoint: 0.0.0.0:8888 + + # Debug: print spans to collector stdout (useful for development) + debug: + verbosity: basic + sampling_initial: 5 + sampling_thereafter: 200 + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, resource, batch] + exporters: [otlp/jaeger] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [prometheus] diff --git a/docs/clients/claude-code.md b/docs/clients/claude-code.md new file mode 100644 index 0000000..2809f33 --- /dev/null +++ b/docs/clients/claude-code.md @@ -0,0 +1,131 @@ +# Claude Code Setup + +> **Last verified:** April 2026 · Claude Code 1.x · [Docs](https://docs.anthropic.com/en/docs/claude-code/mcp) + +Claude Code is the **optimal client** for openstudio-mcp. Its ToolSearch feature automatically defers all 142 tools and retrieves only the 3-5 most relevant ones per turn — this eliminates context bloat and keeps accuracy high even on long multi-step workflows. No manual tool filtering is required. 
+ +--- + +## Prerequisites + +- **Docker Desktop** running ([download](https://www.docker.com/products/docker-desktop/)) +- **Claude Code** installed: `npm install -g @anthropic-ai/claude-code` +- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .` + +--- + +## Configuration + +Create `.mcp.json` in your project root (or any directory you run `claude` from). A template is provided at `.mcp.json.example` in the openstudio-mcp repo. Copy it and fill in your absolute paths: + +```bash +cp .mcp.json.example .mcp.json +# Edit .mcp.json and replace /ABSOLUTE/PATH/TO/... placeholders +``` + +`.mcp.json` contains machine-specific absolute paths so it is gitignored by default. Share the `.mcp.json.example` template with your team instead. + +```json +{ + "mcpServers": { + "openstudio-mcp": { + "command": "docker", + "args": [ + "run", "--rm", "-i", + "-v", "/absolute/path/to/your/inputs:/inputs", + "-v", "/absolute/path/to/your/runs:/runs", + "-v", "/absolute/path/to/openstudio-mcp/.claude/skills:/skills:ro", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ] + } + } +} +``` + +> **Tip:** Include the `.claude/skills` mount. Claude Code's ToolSearch indexes tool descriptions at connection time — the skill guides improve keyword matching for `get_skill()` and `list_skills()` calls. + +**Alternative: pass config path explicitly** + +```bash +claude --mcp-config /path/to/mcp.json +``` + +--- + +## Verification + +```bash +# Confirm .mcp.json is valid and openstudio-mcp is registered +# (run from the project directory containing .mcp.json) +claude mcp add openstudio-mcp --scope project docker -- run --rm -i \ + -v "/absolute/path/to/inputs:/inputs" \ + -v "/absolute/path/to/runs:/runs" \ + -e OPENSTUDIO_MCP_MODE=prod openstudio-mcp:dev openstudio-mcp +# → prints "MCP server openstudio-mcp already exists in .mcp.json" if it's registered +``` + +> **Note:** `claude mcp list` shows only user-scope servers. 
Project-scope `.mcp.json` servers load when you start an interactive `claude` session from that directory — they won't appear in `mcp list`. + +Start a session and test: + +```bash +# Start Claude Code in the project directory +cd /path/to/your/project +claude + +# At the prompt: +> list_skills +``` + +A successful response shows available skill categories. Claude Code will use ToolSearch to find and load only the relevant tool schemas for each request. + +--- + +## How ToolSearch Works + +When the total tool schema size exceeds 10% of the model's context window, Claude Code automatically defers all tools and exposes only a `mcp__openstudio-mcp__search_tools` search endpoint. The workflow becomes: + +1. Your prompt arrives +2. Claude Code searches for relevant tools by keyword +3. 3–5 matching tool schemas are loaded into context +4. The tool is called with correct parameters +5. Repeat as needed + +This means openstudio-mcp's 142 tools behave as if there were only 5 at any given moment. **Tool names and descriptions are baked into the Docker image at build time, and ToolSearch indexes them when Claude Code connects** — always rebuild the image after adding new tools (`docker build`). + +### Why ToolSearch Accuracy Depends on Descriptions + +ToolSearch uses BM25/regex matching on tool names and descriptions. Vague prompts ("add HVAC") depend on description keywords to route correctly. The skills system supplements this — calling `list_skills()` and `get_skill("add-hvac")` gives Claude a step-by-step guide that bypasses tool ambiguity entirely. + +--- + +## First Prompts + +``` +Simple: "Create an example model and describe its thermal zones" +Medium: "Follow the new-building skill to create a 5-story office in Boston" +Advanced: "Load /inputs/baseline.osm, run a simulation, and show me the EUI breakdown by end use" +``` + +--- + +## Context & Performance Notes + +ToolSearch reduces per-turn schema overhead from ~15K tokens (all 142 loaded) to ~1K tokens (3-5 tools loaded). 
This is the primary reason Claude Code is the recommended client — see [Token Context & Performance](./token-context-performance.md) for numbers. + +Observed benchmark (3-model sweep, 180 tests, zero retries): +- Sonnet: 94.4% pass rate, avg 1.9 ToolSearch calls/test +- Opus: 94.4% pass rate, avg 2.0 ToolSearch calls/test +- Haiku: 88.9% pass rate (does not use ToolSearch; reasons directly from tool list) + +--- + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---------|-------------|-----| +| ToolSearch returns "No matching tools found" | Image not rebuilt after tool additions | `docker build -t openstudio-mcp:dev -f docker/Dockerfile .` | +| Tools work in one session, not another | `.mcp.json` not in working directory | Check `pwd` matches where `.mcp.json` lives | +| Claude generates Python scripts instead of using tools | ToolSearch not finding MCP tools | Rebuild image; check descriptions include relevant keywords | +| Long workflows lose model state | In-memory model cleared between `claude` sessions | Save model to `/runs/` at end of each session with `save_osm_model` | diff --git a/docs/clients/claude-desktop.md b/docs/clients/claude-desktop.md new file mode 100644 index 0000000..de104f0 --- /dev/null +++ b/docs/clients/claude-desktop.md @@ -0,0 +1,108 @@ +# Claude Desktop Setup + +> **Last verified:** April 2026 · Claude Desktop 0.10+ · [Docs](https://support.anthropic.com/en/articles/9517730-getting-started-with-claude-desktop) + +Claude Desktop is the recommended starting point for openstudio-mcp. It has a GUI, supports all 142 tools, and handles the full skill workflow. The main limitation is that all tool schemas load into context upfront — above ~100 tools, you may notice the model spending more tokens on tool routing before the first useful response. 
+ +--- + +## Prerequisites + +- **Docker Desktop** running ([download](https://www.docker.com/products/docker-desktop/)) +- **Claude Desktop** installed ([download](https://claude.ai/download)) +- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .` + +--- + +## Configuration + +Open the Claude Desktop config file: + +| OS | Path | +|----|------| +| macOS | `~/Library/Application Support/Claude/claude_desktop_config.json` | +| Windows | `%APPDATA%\Claude\claude_desktop_config.json` | + +Add the `openstudio-mcp` entry to the `mcpServers` block. Replace the placeholder paths with **absolute paths** on your machine: + +```json +{ + "mcpServers": { + "openstudio-mcp": { + "command": "docker", + "args": [ + "run", "--rm", "-i", + "-v", "/absolute/path/to/your/inputs:/inputs", + "-v", "/absolute/path/to/your/runs:/runs", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ] + } + } +} +``` + +**Optional: include skill guides** (enables `list_skills()` / `get_skill()` tools): + +```json +"-v", "/absolute/path/to/openstudio-mcp/.claude/skills:/skills:ro", +``` + +--- + +## Verification + +1. **Restart Claude Desktop** after saving the config +2. Look for the **hammer icon (🔨)** in the chat input bar — it appears when at least one MCP server is connected +3. Click the hammer icon to see all available tools listed under "openstudio-mcp" +4. Send this test prompt: + + > *"Call list_skills and tell me what skill categories are available."* + + A successful response lists the skill categories (geometry, HVAC, simulation, etc.). If you see a generic response or an error, check the troubleshooting section below. 
+ +--- + +## First Prompts + +Try these in order of complexity: + +``` +Simple: "Create an example model and tell me about it" +Medium: "Create a small office building with ASHRAE System 3 and show me the HVAC components" +Advanced: "Load my model at /inputs/MyBuilding.osm, apply the 90.1-2019 template, and run a simulation" +``` + +--- + +## File Access Pattern + +Place your `.osm` models and weather files in the folder you mapped to `/inputs`. Claude Desktop's built-in file upload puts files into an Analysis sandbox that cannot reach MCP tools — always use the `/inputs` mount instead. + +```bash +# Copy your model to the inputs folder before referencing it in chat +cp MyBuilding.osm /absolute/path/to/your/inputs/ + +# Then reference it in the prompt +"Analyze the building at /inputs/MyBuilding.osm" +``` + +--- + +## Context & Performance Notes + +Claude Desktop loads all 142 tool schemas into context on the first tool call. This costs approximately **15K tokens** of your context budget upfront — see [Token Context & Performance](./token-context-performance.md) for a full breakdown. + +Practical effect: initial responses in a new conversation may include brief tool-selection overhead. Long conversations (15+ turns with heavy tool use) may exhaust context on complex models. If this happens, start a fresh conversation and reference `/runs/` outputs by path. 
+ +--- + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---------|-------------|-----| +| No hammer icon | Docker not running, or config JSON is invalid | Validate JSON at jsonlint.com; check `docker ps` | +| Hammer icon but no openstudio-mcp tools | Image not found | Run `docker images` to confirm `openstudio-mcp:dev` exists | +| `Error: volume path is not absolute` | Relative paths in config | Replace `./runs` with the full absolute path | +| Model loaded but changes lost | `/runs` not mounted | Confirm the `-v` run path mount is in your config | +| Upload file, tools not used | File went to Analysis sandbox | Move file to `/inputs` folder and reference by path instead | diff --git a/docs/clients/cursor.md b/docs/clients/cursor.md new file mode 100644 index 0000000..cfdb437 --- /dev/null +++ b/docs/clients/cursor.md @@ -0,0 +1,31 @@ +# Cursor — Not Compatible + +Cursor has a **40-tool hard cap** for MCP servers. openstudio-mcp provides 142 tools. Cursor will silently truncate the tool list to the first 40 returned by `tools/list`, which means the majority of the BEM workflow — including HVAC configuration, geometry editing, results extraction, and measure authoring — will be inaccessible. + +There is no supported workaround within Cursor itself (tool filtering is not user-configurable at the MCP level in current versions). + +--- + +## What Happens If You Try + +Adding openstudio-mcp to `.cursor/mcp.json` will technically connect the server. Cursor will load the first 40 tools alphabetically from `tools/list`. Prompts that happen to use those 40 tools will work; anything requiring tools beyond position 40 will fail silently (the model will either hallucinate a response or say the operation isn't possible). 
+ +--- + +## Recommended Alternatives + +| Client | Why it's better for this use case | +|--------|----------------------------------| +| **Claude Code** | Best option: ToolSearch handles 142 tools with auto-deferral | +| **Windsurf** | 100-tool limit; workable with manual tool selection | +| **VS Code Copilot** | 128-tool limit; close to full coverage with minor tool disabling | +| **Gemini CLI** | 100 soft limit; `includeTools` filter makes it manageable | +| **Claude Desktop** | Full 142 tools; good for interactive exploration | + +--- + +## If You Must Use Cursor + +If your workflow is confined to a narrow subset of tools (e.g., only model inspection and simulation result reading), you can curate a 40-tool subset by running a local wrapper that filters the tool list before serving it to Cursor. This is an advanced workaround and not officially supported. + +Track Cursor's MCP roadmap for changes to the tool cap: [Cursor MCP docs](https://docs.cursor.com/context/model-context-protocol). diff --git a/docs/clients/gemini-cli.md b/docs/clients/gemini-cli.md new file mode 100644 index 0000000..d368596 --- /dev/null +++ b/docs/clients/gemini-cli.md @@ -0,0 +1,124 @@ +# Gemini CLI Setup + +> **Last verified:** April 2026 · Gemini CLI 0.1.x · [Docs](https://github.com/google-gemini/gemini-cli) + +Gemini CLI is a terminal-based AI agent with a **1M token context window** — the largest of any supported client. This makes it well-suited for long BEM workflows and large file analysis. The soft 100-tool limit (512 via API) requires using `includeTools` to avoid degraded performance when all 142 tools are registered. + +--- + +## Prerequisites + +- **Docker Desktop** running +- **Gemini CLI** installed: + ```bash + npm install -g @google/gemini-cli + ``` + > **Note:** The Homebrew formula (`brew install gemini-cli`) has a known dependency issue with `@google/gemini-cli-core`. Use npm. 
+- **Google account** (free tier: 60 req/min, 1,000 req/day) or Gemini API key +- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .` + +--- + +## Configuration + +Add openstudio-mcp to `~/.gemini/settings.json`. Create the file if it doesn't exist: + +```json +{ + "mcpServers": { + "openstudio-mcp": { + "command": "docker", + "args": [ + "run", "--rm", "-i", + "-v", "/absolute/path/to/your/inputs:/inputs", + "-v", "/absolute/path/to/your/runs:/runs", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ] + } + } +} +``` + +**Alternative: project-scoped config** via `GEMINI.md` in your project directory. Add a code block with the server config — Gemini CLI reads `GEMINI.md` as project context on startup. + +--- + +## Managing the Tool Limit + +Gemini CLI has a soft limit of 100 tools in interactive mode (512 via API). With 142 tools registered, performance may degrade. Use the `includeTools` filter to expose only the tools you need: + +```json +{ + "mcpServers": { + "openstudio-mcp": { + "command": "docker", + "args": [ + "run", "--rm", "-i", + "-v", "/absolute/path/to/inputs:/inputs", + "-v", "/absolute/path/to/runs:/runs", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ], + "includeTools": [ + "list_skills", "get_skill", + "create_new_building", "create_bar_building", "create_typical_building", + "load_osm_model", "save_osm_model", "get_model_summary", "get_building_info", + "list_thermal_zones", "list_spaces", "list_air_loops", "list_plant_loops", + "add_baseline_system", "list_baseline_systems", + "run_simulation", "get_run_status", + "extract_summary_metrics", "extract_end_use_breakdown", "compare_runs", + "validate_model", "change_building_location", + "list_surfaces", "replace_window_constructions", + "create_measure", "test_measure", "apply_measure", + "generate_results_report", "recommend_tools" + ] + } + } +} +``` + +Extend the `includeTools` list as needed. 
See [index.md](./index.md) for the full tool list organized by workflow. + +--- + +## Verification + +```bash +# Confirm openstudio-mcp is registered (no API call needed) +gemini mcp list +# → Should show "✓ openstudio-mcp: docker run ... (stdio) - Connected" + +# Start Gemini CLI and test interactively +gemini +> Use openstudio-mcp to list the available skills +``` + +--- + +## First Prompts + +``` +"Create a medium office building in Chicago using openstudio-mcp and run a simulation" +"Load the model at /inputs/baseline.osm and compare the envelope constructions" +"Write a Ruby measure to set all lights to 8 W/m2, test it, and apply it" +``` + +--- + +## Context & Performance Notes + +Gemini 2.0/2.5 models have a 1M token context window, so tool schema overhead (~29K tokens for all 142) is a small fraction of total capacity. Long BEM workflows with many intermediate results are well-suited to Gemini CLI's large context. + +However, accuracy is a function of tool count presented per turn, not total context size. Using `includeTools` to present 30–40 tools at a time keeps the model focused. See [Token Context & Performance](./token-context-performance.md). 
+ +--- + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---------|-------------|-----| +| Tools not found | Config file location wrong | Confirm `~/.gemini/settings.json` (not `.gemini/config.json`) | +| Tool count exceeds limit warning | >100 tools registered | Add `includeTools` filter to config | +| Slow first response | All 142 schemas loading | Add `includeTools` to reduce initial schema payload | +| Free tier rate limit hit | >60 req/min | Upgrade to Gemini API key or reduce tool calls per workflow | diff --git a/docs/clients/index.md b/docs/clients/index.md new file mode 100644 index 0000000..0a31fe1 --- /dev/null +++ b/docs/clients/index.md @@ -0,0 +1,66 @@ +# MCP Client Setup Guide + +This section covers how to connect openstudio-mcp to each supported AI client, what to expect from the 142-tool surface in each environment, and how to evaluate the performance impact on your context window. + +--- + +## Client Compatibility + +| Client | Tool Limit | Discovery | Status | Notes | +|--------|-----------|-----------|--------|-------| +| **Claude Code** | Unlimited | ToolSearch (auto-defer) | ✅ Best | Defers all 142 tools; retrieves 3-5 per turn by keyword | +| **Claude Desktop** | ~100 practical | None (all in context) | ✅ Full | All tools load upfront; degradation above ~100 tools | +| **VS Code Copilot** | 128 hard | None | ✅ Full | Requires VS Code 1.99+ with MCP support enabled | +| **Windsurf** | 100 hard | Per-tool toggle | ⚠️ Partial | Must disable 42+ tools via UI; not plug-and-play | +| **Gemini CLI** | 100 soft / 512 API | includeTools/excludeTools | ⚠️ Partial | Use `includeTools` to scope to a working subset | +| **Cursor** | 40 hard | None | ❌ Incompatible | 40-tool hard cap; use Windsurf or Claude Code instead | + +**Recommendation:** Claude Code is the optimal client for openstudio-mcp. It is the only client with dynamic tool discovery that handles 142 tools efficiently and without manual configuration. 
+ +--- + +## Canonical Docker Server Config + +Every client needs a block that tells it how to launch the server. The **core Docker command** is the same in all cases — only the key names differ by client. + +```json +{ + "command": "docker", + "args": [ + "run", "--rm", "-i", + "-v", "/ABSOLUTE/PATH/TO/inputs:/inputs", + "-v", "/ABSOLUTE/PATH/TO/runs:/runs", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ] +} +``` + +**Required substitutions:** +- `/ABSOLUTE/PATH/TO/inputs` — folder containing your `.osm` and weather files +- `/ABSOLUTE/PATH/TO/runs` — folder where simulation outputs will be written + +> **Use absolute paths.** Many clients run the command from an unpredictable working directory, so relative paths like `./runs` will silently fail or point to the wrong location. + +**Optional: mount skill guides for `get_skill()` / `list_skills()` access** + +```json +"-v", "/ABSOLUTE/PATH/TO/openstudio-mcp/.claude/skills:/skills:ro", +``` + +See each client guide for how to embed this block in the client's specific config format. 
+ +--- + +## Guide Index + +- [Claude Desktop](./claude-desktop.md) — Recommended starting point; GUI client with full tool support +- [Claude Code](./claude-code.md) — Best for power users; ToolSearch handles 142 tools efficiently +- [VS Code Copilot](./vs-code-copilot.md) — VS Code 1.99+; 128-tool limit, workspace-scoped config +- [Windsurf](./windsurf.md) — Cascade AI; 100-tool limit requires manual tool selection +- [Gemini CLI](./gemini-cli.md) — Terminal-based; 1M token context; use `includeTools` to subset +- [Cursor](./cursor.md) — Not compatible; 40-tool hard cap; alternatives listed + +## Reference + +- [Token Context & Performance Impact](./token-context-performance.md) — How the 142-tool surface affects each client's context budget diff --git a/docs/clients/token-context-performance.md b/docs/clients/token-context-performance.md new file mode 100644 index 0000000..a652f1f --- /dev/null +++ b/docs/clients/token-context-performance.md @@ -0,0 +1,324 @@ +# Token Context & Performance Impact + +This document covers the measurable cost of connecting openstudio-mcp to an LLM client: how many tokens the 142 tools consume, how different clients handle that load, and what strategies reduce the overhead. + +Schema measurements are live — extracted directly from the running MCP server via `tools/list` JSON-RPC (see [Measuring Schema Size](#measuring-schema-size) below). LLM accuracy figures are from the three-model benchmark sweep (180 tests, zero retries; see [`docs/knowledge/tool-discovery-and-llm-testing.md`](../knowledge/tool-discovery-and-llm-testing.md)). 
+ +--- + +## What Adds to Context + +When an MCP client connects to openstudio-mcp, the following items may enter the model's context window: + +| Item | Size | When loaded | +|------|------|-------------| +| Tool schemas (all 142) — full JSON | ~117K chars / **~29K tokens** | On first tool call or session start | +| Server instructions (`NEVER`/`ALWAYS` rules) | ~550 tokens | Once per session | +| Skill guide content (`get_skill()` output) | 1–4K tokens per guide | When explicitly requested | +| MCP prompts / resources | ~0.5K tokens each | When explicitly invoked | +| Conversation history | Grows per turn | Accumulates throughout session | + +**Total fixed overhead on first tool call: ~30–32K tokens.** + +For comparison, a full simulation run (create building → simulate → extract results → compare) takes approximately **15K total tokens** in conversation — about half the schema overhead alone. + +> **Note on schema token counting:** Early measurements reported ~61K chars / ~15K tokens. That figure counted **names + descriptions only** and omitted the JSON input schemas (parameter names, types, enums, defaults). The full JSON payload an LLM actually receives is 117K chars / ~29K tokens. Both figures are accurate for their stated scope; use the full-JSON number for context budget planning. + +--- + +## Schema Size History + +The schema size has been measured at multiple points in the project: + +| Date | Tools | Schema Chars (full JSON) | Est. 
Tokens | Change | +|------|-------|--------------------------|-------------|--------| +| Feb 2026 | 62 | ~54K | ~13.5K | Initial | +| Mar 2026 | 126 | ~175K | ~44K | +64 tools | +| Mar 2026 (post-compress) | 127 | ~108K | ~27K | 30% description compression | +| Apr 2026 | 142 | **117K** | **~29K** | +15 tools; measured live | + +Breakdown of the Apr 2026 117K chars: +- Names: ~2.7K chars (~673 tokens) +- Descriptions: ~58.7K chars (~14.7K tokens) +- Input schemas (params/types/enums): ~36.1K chars (~9K tokens) +- JSON structure overhead: ~19.5K chars (~4.9K tokens) + +Key lesson: description compression reduced schema size but harmed ToolSearch accuracy (compressed descriptions had fewer keywords for BM25 matching). The current schema is a deliberate balance between size and discoverability. + +--- + +## Per-Client Context Budget + +### Context windows + +Measured April 2026 from live `tools/list` response (117,047 chars / 142 tools): + +| Client / Model | Context Window | Schema Tokens | Overhead % | Notes | +|----------------|---------------|---------------|-----------|-------| +| Claude Code (Sonnet 4.7) | 200K tokens | **~1K tokens*** | **~0.5%*** | ToolSearch defers all; 3–5 tools/turn | +| Claude Desktop (Sonnet 4.7) | 200K tokens | ~29K tokens | ~14.6% | All 142 schemas in context | +| VS Code Copilot (GPT-4.1) | 128K tokens | ~28K tokens† | ~22.0%† | 128-tool cap enforced | +| VS Code Copilot (Claude Sonnet 4.7) | 200K tokens | ~28K tokens† | ~14.1% | 128-tool cap enforced | +| VS Code Copilot (Gemini 2.5 Flash) | 1M tokens | ~28K tokens† | ~2.8% | 128-tool cap enforced | +| Windsurf / 80-tool curated | 200K tokens | ~16.5K tokens | ~8.2% | Manual curation required | +| Gemini CLI (Gemini 2.5 Pro) | 1M tokens | ~29K tokens | ~2.9% | Use `includeTools` to reduce | + +\* Claude Code ToolSearch defers all tools; only 3–5 schemas (~620–1,030 tokens at ~205 tokens/tool avg) load per turn. 
+† VS Code Copilot enforces a 128-tool cap; 14 smallest tools excluded, saving ~1.1K tokens. The 14 excluded tools (based on schema size) are: `get_run_period`, `get_versions`, `get_server_status`, `get_weather_info`, `match_surfaces`, `get_simulation_control`, `cancel_run`, `enable_ideal_air_loads`, `set_lifecycle_cost_params`, `extract_hvac_sizing`, `extract_zone_summary`, `get_run_artifacts`, `extract_envelope_summary`, `get_zone_hvac_details`. + +### When Context Pressure Becomes a Problem + +Claude Code triggers ToolSearch automatically when schemas exceed 10% of context. For other clients, the model itself must manage context. Signs of context pressure: + +- Model begins truncating or paraphrasing earlier in the conversation +- Tool calls start failing to pass correct parameter values (model "forgets" schema details) +- Model stops using tools entirely and falls back to explaining what it would do +- Long simulation chains: after 20+ turns with large intermediate results, accuracy drops + +**Practical guideline:** At ~29K tokens of schema overhead, Claude Desktop and VS Code Copilot (GPT-4.1 on 128K context) already spend ~15–22% of their budget before any conversation. Plan for 10–15 high-quality turns on complex workflows. Start a new conversation and reference `/runs/` output paths to continue. + +--- + +## How Clients Handle 142 Tools + +### Claude Code: ToolSearch (Deferred Loading) + +ToolSearch indexes all 142 tools at image build time using BM25/regex on names and descriptions. When schemas exceed 10% of context, tools are deferred. Per turn: +- ~3–5 tool schemas load into context (~1K tokens, ~97% reduction vs. 29K) +- Schema overhead per turn: ~1,030 tokens (5 tools × ~205 tokens/tool avg) +- Works because the ToolSearch index holds the full schema catalog outside context + +**Benchmark result:** 94.4% pass rate (Sonnet/Opus, 180 tests, zero retries). ToolSearch calls: avg 1.9/test. 
+ +### Claude Desktop / VS Code Copilot: Brute-Force Load + +All enabled tool schemas load into context on the first tool call. No deferred loading, no filtering. Performance effect: +- First response in a new session has ~14–22% context already consumed (vs. ~0.5% for Claude Code) +- Accuracy stays high for shorter sessions (5–10 turns) +- Long sessions may show degradation as conversation history + schema + results approach the context limit +- VS Code Copilot with GPT-4.1 (128K window) is most constrained: ~22% of context consumed before the first user message + +### Windsurf: Per-Tool Toggle (Manual Curation) + +Cascade enforces 100 tools hard. User selects which tools are enabled. With a curated 80-tool set (~16.5K tokens), the overhead is ~43% lower than loading all 142. Manual curation adds setup friction but produces the most focused tool surface. + +### Gemini CLI: Large Context Buffer + +1M token context window means schema overhead (~29K tokens = ~2.9%) is low even at full load. The practical concern is accuracy per turn, not context exhaustion — presenting all 142 tools at once can confuse the model. Use `includeTools` to keep per-turn tool count under ~40. + +--- + +## Strategies to Reduce Context Overhead + +### 1. Use `list_skills` + `get_skill` First (Universal) + +Instead of letting the model search all 142 tools, ask it to follow a skill guide. The guide gives explicit tool names and order, bypassing tool discovery entirely: + +``` +"Use the new-building skill to create a medium office building in Boston." +``` + +vs. + +``` +"Create a medium office building in Boston." ← model must select from 142 tools +``` + +Both work, but the first produces fewer ToolSearch calls and more predictable tool sequences. + +### 2. Enable `defer_loading` (OpenAI-Compatible Clients) + +For clients that support the OpenAI `defer_loading` flag, set it on the server config. This exposes only a search tool by default and loads schemas on demand. 
Reduces first-call overhead by ~85%. + +### 3. Use `includeTools` / Per-Tool Toggles (Windsurf, Gemini CLI) + +Configure a focused tool subset matching your current workflow phase. A 30-tool simulation workflow subset (~4–5K tokens) is well within any client's context budget and produces cleaner responses than exposing all 142. + +### 4. Reference `/runs/` Paths, Not Inline Results + +Instead of asking the model to read and summarize large simulation outputs inline, reference them by path: + +``` +"The simulation output is at /runs/run-20260415/. Extract the EUI." +``` + +This lets `extract_summary_metrics` and `extract_end_use_breakdown` do targeted extraction rather than streaming the full HTML report into context. + +### 5. Split Long Workflows Across Conversations (Claude Desktop) + +Save model state at key checkpoints with `save_osm_model`. Start a fresh conversation for the next phase. Reference saved files by path. This resets conversation history overhead while preserving all model changes. + +--- + +## Tool Call Latency + +Measured April 2026 via OpenLLMetry traces (Jaeger). Environment: Apple M3 Max, Docker amd64 emulation, openstudio-mcp:tracing image. + +### Cold Start (first tool call per Docker container) + +Includes: Docker container launch + Python server init + first `import openstudio`. 
+ +| Phase | Latency | +|-------|---------| +| Container start → MCP `initialize` response | ~1.8–2.1s | +| First `import openstudio` (cold) | included above | +| tools/list (full 142 schema, 117K chars) | ~1s (bundled at init) | + +### Warm Tool Call Latency (in-session, OpenStudio already loaded) + +| Tool | Avg latency | Notes | +|------|-------------|-------| +| `get_server_status` | ~3ms | No OpenStudio ops | +| `list_skills` | ~1ms | File read only | +| `validate_model` | ~3ms | Model checks | +| `list_spaces` / `list_thermal_zones` / `list_air_loops` | 1–5ms | In-memory iteration | +| `list_weather_files` | ~12ms | EPW file scan | +| `get_building_info` | ~5–15ms | Model introspection | +| `get_model_summary` | ~8–23ms | Full object count | +| `get_versions` | ~150–220ms | OpenStudio SDK call | +| `create_example_osm` | ~200–215ms | Model build from scratch | + +> **Note:** These latencies reflect the MCP server's processing time. Client-visible latency adds the LLM inference time on top (typically 1–10s for tool call generation + JSON parse). The server itself is fast; bottlenecks in multi-step workflows are almost always LLM inference, not tool execution. + +### Running Traces Yourself + +```bash +# 1. Start Jaeger +docker compose -f docker/docker-compose.tracing.yml up -d + +# 2. Build tracing image +docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile . + +# 3. Connect your client with tracing env vars (see compose file header) +# Traces appear at http://localhost:16686 +``` + +--- + +## LLM Accuracy vs. 
Tool Count + +From internal benchmarks and published research: + +| Tools Presented Per Turn | Accuracy | Source | +|--------------------------|----------|--------| +| 5–7 | ~92% | Jenova.ai | +| 10–15 | sweet spot | Multiple | +| 3–5 (ToolSearch output) | 94.4% | openstudio-mcp sweep | +| 40+ (all visible, no deferral) | Degraded | Allen Chan / IBM | +| 100+ (no retrieval) | ~13–14% | RAG-MCP | +| 100+ (with semantic retrieval) | ~43% | RAG-MCP | + +The openstudio-mcp benchmark shows 94.4% at 142 tools **because ToolSearch reduces the per-turn visible set to 3–5**. Without ToolSearch (e.g., Claude Desktop), the effective tool count visible to the model per turn is still all 142, but Claude's reasoning capability keeps accuracy high for sessions under ~20 turns. + +--- + +## Local LLM Benchmark: llama3.2:3b vs gemma3:4b + +Measured April 2026 on Apple M3 Max (14 CPU, 36 GB RAM, no GPU) via Ollama v0.20.7 + [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 0.4.11. Tasks: **GSM8K** (math / chain-of-thought reasoning) and **IFEval** (instruction following). 100 samples per task, zero-shot. + +| Model | GSM8K flexible-extract | IFEval prompt loose | IFEval inst loose | Runtime | Disk | +|---|---|---|---|---|---| +| llama3.2:3b (Meta, US) | **0.67** | 0.630 | 0.767 | 9m35s | 2.0 GB | +| gemma3:4b (Google DeepMind, US) | 0.55 | **0.750** | **0.828** | 16m39s | 3.3 GB | + +**Takeaways:** +- `llama3.2:3b` wins on math/reasoning (0.67 vs 0.55 GSM8K). +- `gemma3:4b` wins on instruction following (0.750 vs 0.630 IFEval prompt-level loose) — more relevant for agentic tool use. +- `gemma3:4b` **does not support native tool calling in Ollama** — the `/api/chat` endpoint returns HTTP 400 for any `tools` field. This makes it unsuitable for measuring MCP schema overhead or running tool-calling benchmarks with Ollama. 
+- **`llama3.2:3b` is the recommended model for CI-based MCP overhead benchmarks**: native tool calling, 2.0 GB fits comfortably on GitHub Actions `ubuntu-latest` (16 GB RAM), and is from a US-based company. + +### How to Reproduce + +```bash +# Pull both models +ollama pull llama3.2:3b +ollama pull gemma3:4b + +# Start server +ollama serve & + +# Install deps +pip install lm_eval langdetect immutabledict nltk +python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('stopwords')" + +# Run benchmark (100 samples each, ~10 min for llama3.2:3b) +python3 -m lm_eval \ + --model local-chat-completions \ + --model_args "model=llama3.2:3b,base_url=http://localhost:11434/v1/chat/completions,num_concurrent=1,max_retries=3,tokenized_requests=False" \ + --tasks gsm8k,ifeval \ + --num_fewshot 0 \ + --limit 100 \ + --apply_chat_template \ + --output_path /tmp/lmeval_results/llama32_3b +``` + +> **Note:** Use `--model local-chat-completions` (not `openai-chat-completions`). The `base_url` must be the full chat completions path. Loglikelihood tasks (ARC, HellaSwag, MMLU) raise `NotImplementedError` for chat models; only `generate_until` tasks like `gsm8k` and `ifeval` work. + +--- + +## Token Overhead by Scenario (Ollama Measurement) + +Measured April 2026 using Ollama `prompt_eval_count` — the actual number of prompt tokens processed. Prompt: "What is the total floor area of the current building?" Three runs per scenario, median reported. + +| Scenario | Tools | llama3.2:3b tokens | Delta | First-call latency | +|---|---|---|---|---| +| No tools (baseline) | 0 | 36 | — | 0.14s | +| 5 tools | 5 | 529 | +493 | 0.07s | +| 30 tools | 30 | 2,579 | +2,543 | 0.09s | +| 142 tools (synthetic, compact) | 142 | 11,763 | +11,727 | 0.19s | +| **142 tools (real openstudio-mcp)** | **142** | **~29,000** | **~28,964** | ~0.2s | + +The synthetic tools used in the Ollama benchmark averaged ~83 tokens each (compact schema). 
Real openstudio-mcp tools average **~204 tokens each** (detailed descriptions + parameter schemas), so the real delta scales to approximately **~29K tokens** — consistent with the live server measurements above. + +**gemma3:4b:** baseline only (36 → 20 tokens, no-tools prompt). All tool-bearing requests returned HTTP 400 — `gemma3:4b` does not support tool calling in Ollama's `/api/chat` endpoint. Cannot measure MCP overhead with this model via Ollama. + +--- + +## Measuring Schema Size + +To reproduce the schema measurements in this document, run against the live MCP server: + +```python +import subprocess, json + +cmd = ["docker", "run", "--rm", "-i", "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp"] + +init_msg = json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize", + "params": {"protocolVersion": "2024-11-05", "capabilities": {}, + "clientInfo": {"name": "measure", "version": "1.0"}}}) + "\n" +list_msg = json.dumps({"jsonrpc": "2.0", "id": 2, "method": "tools/list", + "params": {}}) + "\n" + +proc = subprocess.run(cmd, input=(init_msg + list_msg).encode(), + capture_output=True, timeout=30) +for line in proc.stdout.decode().split("\n"): + try: + obj = json.loads(line) + if obj.get("id") == 2: + tools = obj["result"]["tools"] + schema_json = json.dumps(tools) + print(f"Tools: {len(tools)}") + print(f"Full JSON chars: {len(schema_json):,}") + print(f"Est tokens: {len(schema_json)//4:,}") + desc_chars = sum(len(t.get("description","")) for t in tools) + print(f"Descriptions only: {desc_chars:,} chars / ~{desc_chars//4:,} tokens") + except Exception: + pass +``` + +--- + +## Evaluation Checklist + +When comparing client performance against openstudio-mcp, measure: + +- [ ] **First tool call latency** — time from prompt to first tool invocation +- [ ] **Schema token overhead** — use the script above; compare to client's token counter +- [ ] **ToolSearch calls per workflow** — how often the model searches before acting +- [ ] 
**Accuracy at turn 5 vs. turn 20** — does accuracy degrade in long sessions? +- [ ] **Failure mode when context is full** — does the model warn, truncate, or silently fail? +- [ ] **`list_skills` adherence** — does the model follow the skill guide or guess tool params? +- [ ] **Trace latency** — instrument with `TRACELOOP_BASE_URL` + Jaeger to see per-tool call times (see [`docker/docker-compose.tracing.yml`](../../docker/docker-compose.tracing.yml)) + +See [`docs/testing/advanced-evaluation-template.md`](../testing/advanced-evaluation-template.md) for a full structured evaluation form. diff --git a/docs/clients/vs-code-copilot.md b/docs/clients/vs-code-copilot.md new file mode 100644 index 0000000..1b5cf26 --- /dev/null +++ b/docs/clients/vs-code-copilot.md @@ -0,0 +1,106 @@ +# VS Code Copilot Setup + +> **Last verified:** April 2026 · VS Code 1.99+ · [Docs](https://code.visualstudio.com/docs/copilot/chat/mcp-servers) + +VS Code Copilot (GitHub Copilot Chat in agent mode) supports MCP servers from VS Code 1.99 onward. openstudio-mcp's 142-tool count exceeds the 128-tool hard limit, so you need to either disable 14+ tools via the UI or use the workspace config's tool filtering. + +--- + +## Prerequisites + +- **Docker Desktop** running +- **VS Code 1.99 or later** ([download](https://code.visualstudio.com/)) +- **GitHub Copilot** extension installed and active subscription +- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .` + +--- + +## Configuration + +VS Code uses `.vscode/mcp.json` (workspace) or the user-profile MCP config (global). **Note:** VS Code uses the key `"servers"`, not `"mcpServers"` — this is different from Claude Desktop and Windsurf. 
+ +**Workspace config** (`.vscode/mcp.json` in your project root — can be committed): + +```json +{ + "servers": { + "openstudio-mcp": { + "command": "docker", + "args": [ + "run", "--rm", "-i", + "-v", "/absolute/path/to/your/inputs:/inputs", + "-v", "/absolute/path/to/your/runs:/runs", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ] + } + } +} +``` + +**Global config** (all workspaces): open the Command Palette (`Cmd/Ctrl+Shift+P`) and run `MCP: Open User Configuration`. The file uses the same `"servers"` key format. + +--- + +## Enabling MCP in Agent Mode + +MCP tools are only available in **GitHub Copilot Chat agent mode** (`@workspace` / `@agent`). Regular inline completions do not use MCP tools. + +1. Open GitHub Copilot Chat (`Ctrl+Alt+I` / `Cmd+Ctrl+I`) +2. Switch to agent mode with the dropdown in the chat panel header +3. Click **Configure Tools** (wrench icon) to see all available MCP tools and toggle them on/off +4. Send a test prompt: + + > *"Use openstudio-mcp to create an example building model and describe it."* + +--- + +## Handling the 128-Tool Limit + +openstudio-mcp provides 142 tools. VS Code Copilot has a 128-tool hard cap across all active MCP servers. If you have other MCP servers enabled, the limit applies to the combined total. + +**Option A: Disable low-priority tools via the UI** + +In the Configure Tools panel, disable tools you won't use. Good candidates to disable for a first evaluation: +- `add_pv_to_shading`, `add_ev_load`, `set_lifecycle_cost_params` (renewables/cost, if not needed) +- `inspect_osm_summary`, `validate_osw`, `run_osw` (advanced file ops) +- `add_design_day` (if using `change_building_location` instead) + +**Option B: Use `includeTools` in the config** *(when supported — check VS Code release notes)* + +Some VS Code versions support an `includeTools` array to pre-filter exposed tools before the 128 limit is applied. 
Check the [MCP configuration reference](https://code.visualstudio.com/docs/copilot/reference/mcp-configuration) for the current schema. + +--- + +## First Prompts + +Use these in agent mode (`@agent`): + +``` +@agent Create a baseline 10-zone office building with System 7 VAV reheat +@agent Load /inputs/MyBuilding.osm and list all thermal zones with their setpoints +@agent Run a simulation on /runs/baseline.osm and show me the EUI +``` + +--- + +## Context & Performance Notes + +VS Code Copilot's context window depends on the model selected in the chat panel: +- GPT-4.1: 128K tokens +- Claude 3.7 Sonnet: 200K tokens +- Gemini 2.0 Flash: 1M tokens + +With 128 tools loaded, schema overhead is roughly **28K tokens** (reduced slightly from the ~29K for the full 142). See [Token Context & Performance](./token-context-performance.md) for a full breakdown. + +--- + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---------|-------------|-----| +| No MCP tools appear | Using wrong key (`mcpServers` instead of `servers`) | Check `.vscode/mcp.json` uses `"servers"` | +| "Trust this server?" prompt blocks startup | New server security check | Click "Trust" to allow the server to start | +| Tools appear but agent ignores them | Not in agent mode | Switch chat panel to agent mode | +| 128-tool limit error | Too many tools across all servers | Disable low-priority tools via Configure Tools panel | +| Config not picked up | Wrong file location | Confirm `.vscode/mcp.json` exists in the workspace root you opened | diff --git a/docs/clients/windsurf.md b/docs/clients/windsurf.md new file mode 100644 index 0000000..86b3330 --- /dev/null +++ b/docs/clients/windsurf.md @@ -0,0 +1,119 @@ +# Windsurf (Cascade) Setup + +> **Last verified:** April 2026 · Windsurf latest · [Docs](https://docs.windsurf.com/windsurf/cascade/mcp) + +Windsurf's Cascade AI supports MCP via a global config file. 
The **100-tool hard limit** means openstudio-mcp is not plug-and-play — you must disable at least 42 tools before Cascade will connect. This guide covers which tools to keep for common BEM workflows. + +--- + +## Prerequisites + +- **Docker Desktop** running +- **Windsurf** installed ([download](https://windsurf.com/download)) +- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .` + +--- + +## Configuration + +Edit (or create) `~/.codeium/windsurf/mcp_config.json`: + +```json +{ + "mcpServers": { + "openstudio-mcp": { + "command": "docker", + "args": [ + "run", "--rm", "-i", + "-v", "/absolute/path/to/your/inputs:/inputs", + "-v", "/absolute/path/to/your/runs:/runs", + "-e", "OPENSTUDIO_MCP_MODE=prod", + "openstudio-mcp:dev", "openstudio-mcp" + ] + } + } +} +``` + +After saving, open the **MCP panel** in Windsurf (click the MCPs icon in the top-right Cascade panel). The openstudio-mcp server will appear but will be over the 100-tool limit. Proceed to tool selection below. + +--- + +## Selecting Tools (Required) + +Cascade has a hard cap of 100 active tools across all MCP servers. From the MCP settings page, toggle tools off until you are at or below 100. The table below shows a **recommended 80-tool starter set** organized by workflow. 
+ +### Always Keep (Core — 21 tools) + +| Tool | Why | +|------|-----| +| `list_skills`, `get_skill` | Workflow guides — most important for BEM orientation | +| `get_server_status`, `get_versions` | Health checks | +| `load_osm_model`, `save_osm_model`, `create_example_osm`, `create_baseline_osm` | Model load/save | +| `get_model_summary`, `get_building_info` | Model inspection | +| `list_thermal_zones`, `list_spaces`, `list_air_loops`, `list_plant_loops` | Inventory tools | +| `run_simulation`, `get_run_status` | Simulation | +| `extract_summary_metrics`, `extract_end_use_breakdown` | Results | +| `validate_model` | Pre-sim QA | +| `recommend_tools` | Tool router | +| `change_building_location` | Weather setup | + +### Add by Workflow + +| Workflow | Additional Tools to Enable | +|----------|--------------------------| +| New building from scratch | `create_new_building`, `create_bar_building`, `create_typical_building` | +| HVAC changes | `add_baseline_system`, `list_baseline_systems`, `add_air_loop`, `list_zone_hvac_equipment`, `set_component_properties` | +| Envelope work | `list_surfaces`, `list_subsurfaces`, `replace_window_constructions`, `get_construction_details`, `list_materials` | +| Measures | `create_measure`, `test_measure`, `apply_measure`, `list_custom_measures` | +| Results deep-dive | `extract_hvac_sizing`, `extract_envelope_summary`, `extract_zone_summary`, `compare_runs`, `generate_results_report` | +| Schedules/loads | `list_thermal_zones` (detailed), `adjust_thermostat_setpoints`, `get_schedule_details` | + +### Safe to Disable + +Low-use tools that can be re-enabled on demand: +- `add_pv_to_shading`, `add_rooftop_pv`, `add_ev_load` — renewables +- `set_lifecycle_cost_params`, `add_cost_per_floor_area` — lifecycle costing +- `set_adiabatic_boundaries` — special boundary conditions +- `inspect_osm_summary`, `validate_osw`, `run_osw` — raw file operations +- `import_floorspacejs` — only if not doing custom geometry imports +- `add_design_day` — 
only if not defining custom design days + +--- + +## Verification + +After configuring your tool selection: + +1. Restart Windsurf or click "Refresh" in the MCP panel +2. Confirm the tool count shows ≤ 100 in the panel +3. Send a test prompt in Cascade: + + > *"Use the openstudio MCP tools to list the available skills."* + +--- + +## First Prompts + +``` +"Create a baseline 5-zone office building with VAV reheat using openstudio-mcp" +"Load /inputs/MyBuilding.osm and tell me about its HVAC system" +"Run a simulation on /runs/model.osm and extract the EUI" +``` + +--- + +## Context & Performance Notes + +Windsurf puts all enabled tool schemas into context without deferral, similar to Claude Desktop. With a curated 80-tool set, schema overhead is approximately **9–11K tokens**. See [Token Context & Performance](./token-context-performance.md). + +--- + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---------|-------------|-----| +| "Tool limit exceeded" error | More than 100 tools enabled | Disable tools via MCP settings panel | +| Server listed but tools not available | Tool count > 100 (Cascade rejects the whole server) | Must disable tools before connecting | +| Config not picked up | Wrong path | Confirm `~/.codeium/windsurf/mcp_config.json` — note: `windsurf/` subdirectory, not `codeium/` directly | +| Cascade uses its own tools instead of MCP | Prompt doesn't mention MCP | Include "use openstudio-mcp" or "use the openstudio tools" in your prompt | diff --git a/docs/examples/20_deep_retrofit_package.md b/docs/examples/20_deep_retrofit_package.md new file mode 100644 index 0000000..47cb8a7 --- /dev/null +++ b/docs/examples/20_deep_retrofit_package.md @@ -0,0 +1,148 @@ +# Example 20: Hackathon — Deep Retrofit Package Analysis + +Stack energy conservation measures on a baseline office building, simulate before and after, and quantify cumulative savings — all in a single AI-assisted session. 
+ +## Scenario + +A hackathon team wants to find the maximum achievable energy savings from bundling multiple ECMs on a small commercial office building in Boston. They have four hours to model, simulate, and present results. + +The **core package** (validated by integration test) stacks: +1. **High-R wall insulation** — upgrade all exterior walls to R-20 +2. **Thermostat deadband widening** — expand heating/cooling deadband by 2°F each way + +**Optional extensions** (require OpenStudio extension gem environment): +3. **High-performance windows** — replace all glazing with a better-performing construction +4. **Rooftop photovoltaics** — add PV panels on 75% of roof area + +## Prompt + +> Create a small office building in Boston. Run a baseline simulation, then stack high-R wall insulation and thermostat widening, re-simulate, and show me the savings from the package. + +## Tool Call Sequence + +### Step 1 — Baseline Model + Simulation + +``` +1. create_baseline_osm(name="office_baseline", ashrae_sys_num="03", wwr=0.4) +2. load_osm_model(osm_path=<osm_path returned by step 1>) +3. change_building_location(weather_file="/inputs/USA_MA_Boston-Logan.Intl.AP.725090_TMY3.epw") +4. save_osm_model(osm_path="/runs/office_baseline.osm") +5. run_simulation(osm_path="/runs/office_baseline.osm") +6. get_run_status(run_id=<baseline_run_id>) # poll until "success" +7. extract_summary_metrics(run_id=<baseline_run_id>) # record baseline EUI +``` + +### Step 2 — ECM 1: High-R Wall Insulation + +Create an R-20 wall construction and assign it to every exterior wall. + +``` +8. create_standard_opaque_material( + name="R20_Insulation", + thickness_m=0.141, # R-20 IP: k=0.04 W/m-K → 3.52 m²·K/W + conductivity_w_m_k=0.04, + density_kg_m3=30.0, + specific_heat_j_kg_k=1000.0) +9. create_construction(name="High_R_Wall", material_names=["R20_Insulation"]) +10. get_construction_details(construction_name="High_R_Wall") # verify R-value +11. list_surfaces(surface_type="Wall", boundary="Outdoors", max_results=0) + # For each exterior wall surface name: +12. 
assign_construction_to_surface( + surface_name=<surface_name>, construction_name="High_R_Wall") + ... repeat for each exterior wall ... +``` + +### Step 3 — ECM 2: Thermostat Deadband Widening + +Expand the heating/cooling deadband by 2°F in each direction. + +``` +13. adjust_thermostat_setpoints(cooling_offset_f=2.0, heating_offset_f=-2.0) +``` + +### Step 4 — Retrofit Simulation + Comparison + +``` +14. save_osm_model(osm_path="/runs/office_retrofit.osm") +15. run_simulation(osm_path="/runs/office_retrofit.osm") +16. get_run_status(run_id=<retrofit_run_id>) # poll until "success" +17. compare_runs( + baseline_run_id=<baseline_run_id>, + retrofit_run_id=<retrofit_run_id>) +``` + +### Optional Extensions + +#### ECM 3: High-Performance Windows (requires gem environment) + +Use a `SimpleGlazing` material (U-factor + SHGC) to define window performance, then replace all glazing: + +``` +18. create_measure(name="add_simple_glazing", ...) # write a measure that calls + # SimpleGlazing.new(model, u_factor, shgc) and wires it into a Construction +19. apply_measure(measure_dir=..., arguments={u_factor: 1.2, shgc: 0.25}) +20. replace_window_constructions(construction_name="HighPerf_Window") +``` + +> **Note:** `create_standard_opaque_material` creates wall/roof layers — not glazing. +> Window constructions require glazing materials (SimpleGlazing or StandardGlazing) +> authored via `create_measure`. See Example 01 for the measure-authoring pattern. + +#### ECM 4: Rooftop PV (requires gem environment) + +``` +21. add_rooftop_pv(fraction_of_surface=0.75, cell_efficiency=0.18) +``` + +> **Note on PV comparison:** `compare_runs` tracks end-use energy consumption. +> For PV, also check `extract_summary_metrics` to see the site energy reduction +> that accounts for onsite generation. + +## Expected Results + +| Scenario | EUI (kBtu/ft²/yr) | Savings vs. 
Baseline | +|----------|-------------------|----------------------| +| Baseline (System 3 PSZ-AC, Boston) | ~50–65 | — | +| + High-R Insulation | ~48–62 | ~3–6% | +| + Thermostat Widening (cumulative) | ~44–57 | ~8–15% | +| + Window Upgrade (if available) | ~42–54 | ~12–18% | +| + Rooftop PV offset | ~37–49 | ~18–28% | + +*Exact values depend on model geometry and weather year.* + +## Key Tools Used + +| Tool | Purpose | +|------|---------| +| `create_baseline_osm` | 10-zone model with PSZ-AC system and glazing | +| `change_building_location` | Weather file + design days (Boston TMY3) | +| `create_standard_opaque_material` | Define insulation layer (k, density, Cp, thickness) | +| `create_construction` | Assemble opaque material layers into a wall construction | +| `get_construction_details` | Verify R-value of new assembly | +| `list_surfaces` | Find all exterior walls by boundary condition | +| `assign_construction_to_surface` | Apply new construction wall-by-wall | +| `adjust_thermostat_setpoints` | Widen heating/cooling deadband | +| `replace_window_constructions` | Swap all window glazing in one call (optional) | +| `add_rooftop_pv` | Add PV panels on roof shading surfaces (optional) | +| `run_simulation` | Launch EnergyPlus | +| `compare_runs` | EUI delta + per-end-use breakdown between two runs | + +## Why This is a Great Hackathon Demo + +This workflow demonstrates the unique value of AI-assisted building energy modeling: + +- **Manual equivalent**: 4–8 hours per scenario in the OpenStudio GUI +- **With openstudio-mcp**: All ECMs modeled, simulated, and compared in minutes +- **Narrative**: "Here's the pathway to 15% savings from just two measures" +- **Extensibility**: Swap in any building type, location, or ECM package + +## Notes + +- Save the baseline model **before** applying any ECMs — the ECMs modify in-memory state +- ECM 1 uses a single-layer R-20 construction as a simplified approximation of a real multi-layer wall assembly; real projects would 
retain the original assembly and add an insulation layer +- `compare_runs` reports `energy_grand_total_kBtu` (consumption only, excludes water); for PV-inclusive scenarios, use `extract_summary_metrics` site energy instead +- `replace_window_constructions` and `add_rooftop_pv` require OpenStudio extension gems; the core ECMs (insulation + thermostat) work with the SDK alone + +## Integration Test + +See `tests/test_skill_ecm_package.py::test_skill_ecm_package_workflow` diff --git a/docs/testing/advanced-evaluation-template.md b/docs/testing/advanced-evaluation-template.md new file mode 100644 index 0000000..e615922 --- /dev/null +++ b/docs/testing/advanced-evaluation-template.md @@ -0,0 +1,154 @@ +# OpenStudio-MCP: Advanced Evaluation & Workflow Log + +**Date:** [YYYY-MM-DD] +**Evaluator:** [Your Name] +**Session ID/Commit:** [Insert branch or commit hash] + +--- + +## How to Use This Template + +This template guides manual evaluation of the `openstudio-mcp` server across six areas. +Use it alongside the automated test suite — the tests cover deterministic behavior; +this template captures LLM-specific behavior that cannot be unit tested. + +**Suggested time allocation (20-hour eval):** + +| Hours | Focus | +|---|---| +| 1–5 | Sections 1 & 2 — Does the LLM use Skills correctly? | +| 6–12 | Sections 4 & 5 — Long-session state stability + practitioner workflows | +| 13–17 | Section 3 — Artifact size limits, where does it break? | +| 18–20 | Section 6 + write-up, draft SECURITY.md updates if gaps found | + +--- + +## 1. Environment & Setup Adherence + +Testing the "onboarding" experience. Does the LLM correctly identify the environment +and its capabilities? + +| MCP Client | LLM Model | Initial Tool Discovery | Did it call `list_skills`? | Setup Friction | +|---|---|---|---|---| +| Claude Desktop | Claude 3.5 Sonnet | Ad-hoc / Skills-based | [Yes/No] | [e.g. README missing deps] | +| Cursor | GPT-4o | Ad-hoc / Skills-based | [Yes/No] | [e.g. 
Prompting issues] | + +**Automated coverage:** `tests/test_skill_registration.py`, `tests/test_skill_docs.py` + +--- + +## 2. The "Skills" Orchestration Layer + +Instead of consolidating tools, we test how well the LLM uses the provided Markdown "Skills" +to navigate 126+ specialized tools. + +**Test Goal:** Does the agent follow the "Skill" guide or try to "guess" tool parameters? + +| Target Workflow | Skill Used | Adherence Score (1–5) | Observation / Hallucination | +|---|---|---|---| +| HVAC Swap | `add-hvac` | [Score] | [e.g. LLM ignored skill and guessed VAV parameters] | +| Geometry Edit | `tool-workflows` | [Score] | [e.g. Successfully followed skill sequence] | +| Simulation Run | `tool-workflows` (simulate) | [Score] | [e.g. Tried to run sim before loading model] | +| Baseline Generation | `ashrae-baseline-guide` | [Score] | [e.g. Wrong system type selected] | +| QAQC | `qaqc` | [Score] | [e.g. Ran checks on unsimulated model] | + +**Scoring guide:** +- 5 — Followed skill verbatim, correct tool order, correct parameters +- 4 — Minor deviation (e.g., skipped one optional step), correct outcome +- 3 — Partial adherence; required correction prompt +- 2 — Mostly guessed; skill ignored +- 1 — Completely wrong tool sequence or hallucinated parameters + +**Automated coverage:** `tests/llm/` suite (tool selection, routing, progressive workflows) + +--- + +## 3. Data & Artifact Management (Breaking Points) + +Testing the limits of `read_file` vs. `copy_file` for large Building Energy Modeling outputs. + +| Artifact Type | File Size | Tool Used | Result (Success/Truncated/Crash) | Context Usage | +|---|---|---|---|---| +| `eplusout.err` | [e.g. 50 KB] | `read_file` | Success | [Token Count] | +| `eplusout.html` | [e.g. 2.5 MB] | `read_file` | [e.g. Truncated at 50 KB] | [Token Count] | +| `eplusout.html` | [e.g. 2.5 MB] | `copy_file` | [e.g. Successful Local Copy] | N/A | +| `eplusout.eso` | [e.g. 15 MB] | `read_file` | [e.g. 
Truncated] | [Token Count] | +| `eplusout.eso` | [e.g. 15 MB] | `copy_file` | [e.g. Successful Local Copy] | N/A | + +**Note on Artifact Limits:** At what point did the LLM lose the ability to analyze +simulation errors? Document the exact file size and token count where analysis degraded. + +**Key behavior to verify:** +- `read_file` returns `truncated: true` and `bytes_read` when file exceeds `max_bytes` (default 50 KB) +- `copy_file` has no built-in size limit, but can still fail on insufficient disk space, + permission errors, or same-filesystem copy constraints — check `ok` field in the response +- Chunked reading via `offset` parameter allows paginating through large files + +**Automated coverage:** `tests/test_artifact_limits.py` (on `test/artifact-security-coverage` branch) + +--- + +## 4. In-Memory Session & State Persistence + +Testing the reliability of the SWIG-wrapped in-memory model manager over long, +high-turn conversations. + +| Total Turns | Model Size (.osm) | Persistence Check | Did it "drop" the model state? | +|---|---|---|---| +| 5 Turns | 120 KB | Pass — Changes Kept | No | +| 15 Turns | 120 KB | [Pass/Fail] | [e.g. Session timed out/cleared] | +| 25+ Turns | [Size] | [Pass/Fail] | [e.g. SWIG Memory Leak Warning Observed] | + +**What to watch for:** +- "No model loaded" errors appearing mid-session after successful model operations +- SWIG `memory leak of type 'boost::optional...'` warnings in stderr +- Model state diverging (e.g., a zone renamed in turn 3 showing original name in turn 20) + +**Automated coverage:** `tests/test_session_persistence.py` (on `test/artifact-security-coverage` branch; 20+ sequential operations) + +--- + +## 5. BEM Practitioner Workflow (Visual Case Study) + +### Workflow Name: [e.g. ASHRAE 90.1 Baseline Generation] + +**Objective:** [Briefly describe the practitioner's goal] + +**Step-by-Step Execution:** + +1. **User Request:** [Insert Prompt] +2. **Skill Triggered:** [Insert Skill Name] +3. 
**Tool Chain:** [List tools called in sequence] +4. **Outcome:** [Brief summary of BEM result] + +**Visual Documentation:** + +> *Screenshot: LLM following the "Skill" Markdown instructions.* +> `![Skill Adherence](./images/skill_ui.png)` + +> *Screenshot: Final BEM result or 3D visualization verification.* +> `![BEM Verification](./images/result_viz.png)` + +--- + +## 6. Security & Path Validation + +Quick check for path-traversal vulnerabilities or container leaks. + +- **[ ]** Attempted path traversal (`../../etc/passwd`) via `file_path`? **Result:** [Blocked/Allowed] +- **[ ]** Attempted path traversal in `copy_file` `destination`? **Result:** [Blocked/Allowed] +- **[ ]** Verified that `copy_file` stays within mounted volume? **Result:** [Yes/No] +- **[ ]** Attempted to read `/repo` source code via `read_file`? **Result:** [Allowed — `/repo` is intentionally in allowed roots so skill guides and measure templates in the source tree are accessible; be aware this also exposes server source code] +- **[ ]** Attempted `seed_file: "../../model.osm"` in OSW? **Result:** [Path flattened to basename] + +**Automated coverage:** `tests/test_path_safety.py`, `tests/test_artifact_limits.py` +(both on `test/artifact-security-coverage` branch; see `TestCopyFilePathSafety` class) + +--- + +## Summary & Recommendations + +| Finding | Severity | Recommended Action | +|---|---|---| +| [e.g. LLM skips list_skills on HVAC tasks] | Medium | [e.g. Add skill reminder to tool descriptions] | +| [e.g. 2.5 MB HTML truncated, analysis lost] | High | [e.g. 
LLM should proactively use copy_file for >50 KB] | diff --git a/mcp_server/server.py b/mcp_server/server.py index 290156f..935f790 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -1,63 +1,68 @@ from __future__ import annotations -from fastmcp import FastMCP - from mcp_server.config import ENABLE_CODE_MODE from mcp_server.skills import register_all_skills from mcp_server.stdout_suppression import ( redirect_c_stdout_to_stderr, silence_openstudio_stdout_logger, ) - -mcp = FastMCP( - "openstudio-mcp", - instructions=( - "Building energy simulation server (OpenStudio SDK) with 142 tools for " - "creating, modifying, simulating, and analyzing building energy models. " - "Use these tools for all building energy modeling tasks — if no tool " - "exists for a task, ask the user before writing code. " - "NEVER write scripts, code, or files to accomplish tasks that these " - "tools already handle. Specifically: " - "- Measures: ALWAYS use create_measure — never write measure.rb/.py/.xml " - "directly. create_measure handles scaffolding, XML, checksums, and " - "OS App compatibility. Workflow: create_measure → test_measure → apply_measure. " - "- Results/data: use extract_summary_metrics, extract_end_use_breakdown, " - "query_timeseries, extract_envelope_summary, extract_hvac_sizing — " - "never write Python/SQL scripts to parse eplusout.sql. " - "- Visualization: use view_model (3D geometry), view_simulation_data " - "(charts/heatmaps), generate_results_report (HTML report) — never write " - "matplotlib/plotly/HTML scripts. " - "- Models: use create_new_building, create_bar_building, import_floorspacejs " - "— never write raw IDF or OSM files. " - "- Weather: use change_building_location (sets EPW+DDY+CZ in one call) " - "or list_weather_files — never download or write weather files. " - "- HVAC: use add_baseline_system, add_doas_system, add_vrf_system — " - "never write OpenStudio SDK scripts to wire HVAC components. 
" - "For custom HVAC measures, call search_wiring_patterns to get working " - "Ruby wiring code, and search_api to verify methods exist. " - "If a file path is given, use it directly. If a file operation fails, " - "you may call list_files once to find the right path, then retry — " - "do not call list_files more than once for the same file. " - "Use list_weather_files for EPW discovery — do not use list_files for weather. " - "To find objects by type, use list_model_objects(object_type). " - "List tools default to 10 results — use filters to narrow, or " - "max_results=0 for all. Prefer list tools before detail tools to " - "find the right name. " - "When polling get_run_status, wait at least 1-2 minutes between calls. " - "For multi-step workflows, call list_skills() first." - ), -) - -register_all_skills(mcp) - -if ENABLE_CODE_MODE: - from fastmcp.experimental.transforms.code_mode import CodeMode - mcp.add_transform(CodeMode()) +from mcp_server.telemetry import init_telemetry def main(): silence_openstudio_stdout_logger() redirect_c_stdout_to_stderr() + # init_telemetry() must run before FastMCP is instantiated so that + # McpInstrumentor().instrument() can patch FastMCP.__init__ in time. + init_telemetry() + + from fastmcp import FastMCP + + mcp = FastMCP( + "openstudio-mcp", + instructions=( + "Building energy simulation server (OpenStudio SDK) with 142 tools for " + "creating, modifying, simulating, and analyzing building energy models. " + "Use these tools for all building energy modeling tasks — if no tool " + "exists for a task, ask the user before writing code. " + "NEVER write scripts, code, or files to accomplish tasks that these " + "tools already handle. Specifically: " + "- Measures: ALWAYS use create_measure — never write measure.rb/.py/.xml " + "directly. create_measure handles scaffolding, XML, checksums, and " + "OS App compatibility. Workflow: create_measure → test_measure → apply_measure. 
" + "- Results/data: use extract_summary_metrics, extract_end_use_breakdown, " + "query_timeseries, extract_envelope_summary, extract_hvac_sizing — " + "never write Python/SQL scripts to parse eplusout.sql. " + "- Visualization: use view_model (3D geometry), view_simulation_data " + "(charts/heatmaps), generate_results_report (HTML report) — never write " + "matplotlib/plotly/HTML scripts. " + "- Models: use create_new_building, create_bar_building, import_floorspacejs " + "— never write raw IDF or OSM files. " + "- Weather: use change_building_location (sets EPW+DDY+CZ in one call) " + "or list_weather_files — never download or write weather files. " + "- HVAC: use add_baseline_system, add_doas_system, add_vrf_system — " + "never write OpenStudio SDK scripts to wire HVAC components. " + "For custom HVAC measures, call search_wiring_patterns to get working " + "Ruby wiring code, and search_api to verify methods exist. " + "If a file path is given, use it directly. If a file operation fails, " + "you may call list_files once to find the right path, then retry — " + "do not call list_files more than once for the same file. " + "Use list_weather_files for EPW discovery — do not use list_files for weather. " + "To find objects by type, use list_model_objects(object_type). " + "List tools default to 10 results — use filters to narrow, or " + "max_results=0 for all. Prefer list tools before detail tools to " + "find the right name. " + "When polling get_run_status, wait at least 1-2 minutes between calls. " + "For multi-step workflows, call list_skills() first." 
+ ), + ) + + register_all_skills(mcp) + + if ENABLE_CODE_MODE: + from fastmcp.experimental.transforms.code_mode import CodeMode + mcp.add_transform(CodeMode()) + mcp.run() diff --git a/mcp_server/skills/common_measures/wrappers.py b/mcp_server/skills/common_measures/wrappers.py index 6b08625..caed813 100644 --- a/mcp_server/skills/common_measures/wrappers.py +++ b/mcp_server/skills/common_measures/wrappers.py @@ -11,6 +11,7 @@ from typing import Any from mcp_server.skills.measures.operations import apply_measure +from mcp_server.telemetry import traced def _ensure_climate_zone() -> None: @@ -154,6 +155,7 @@ def generate_results_report_op(units: str = "IP", run_id: str | None = None) -> # --- 4. run_qaqc_checks: ASHRAE QA/QC --- +@traced() def run_qaqc_checks_op( template: str = "90.1-2013", checks: list[str] | None = None, diff --git a/mcp_server/skills/comstock/operations.py b/mcp_server/skills/comstock/operations.py index 0a07fd0..03935a5 100644 --- a/mcp_server/skills/comstock/operations.py +++ b/mcp_server/skills/comstock/operations.py @@ -18,6 +18,7 @@ from mcp_server.config import RUN_ROOT from mcp_server.model_manager import get_model from mcp_server.skills.measures.operations import apply_measure +from mcp_server.telemetry import traced # Category classification for ComStock measures _BASELINE_PREFIXES = ( @@ -115,6 +116,7 @@ def list_comstock_measures(category: str | None = None) -> dict[str, Any]: } +@traced() def create_typical_building( template: str = "90.1-2019", building_type: str = "SmallOffice", @@ -299,6 +301,7 @@ def _create_empty_model() -> Path: return osm_path +@traced() def create_bar_building( building_type: str = "SmallOffice", total_bldg_floor_area: float = 10000, @@ -407,6 +410,7 @@ def create_bar_building( return result +@traced() def create_new_building( # Bar geometry args building_type: str = "SmallOffice", diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py index 
189ab9d..ffccda9 100644 --- a/mcp_server/skills/measure_authoring/operations.py +++ b/mcp_server/skills/measure_authoring/operations.py @@ -14,6 +14,7 @@ import openstudio from mcp_server.config import INPUT_ROOT, RUN_ROOT +from mcp_server.telemetry import traced CUSTOM_MEASURES_DIR = RUN_ROOT / "custom_measures" @@ -740,6 +741,7 @@ def _write_test_file(measure_dir: Path, class_name: str, args: list[dict], # ── Public operations ──────────────────────────────────────────────── +@traced() def create_measure_op( name: str, description: str, diff --git a/mcp_server/skills/measures/operations.py b/mcp_server/skills/measures/operations.py index 52a0c4d..a1238a2 100644 --- a/mcp_server/skills/measures/operations.py +++ b/mcp_server/skills/measures/operations.py @@ -18,6 +18,7 @@ from mcp_server.config import OSCLI_GEM_PATH, OSCLI_GEMFILE, RUN_ROOT from mcp_server.model_manager import get_model, load_model +from mcp_server.telemetry import traced from mcp_server.util import resolve_run_dir @@ -115,6 +116,7 @@ def _parse_runner_messages(out_osw_path: Path) -> dict[str, Any] | None: return None +@traced() def apply_measure( measure_dir: str, arguments: dict[str, Any] | None = None, diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py index 46bdfcd..34673bd 100644 --- a/mcp_server/skills/simulation/operations.py +++ b/mcp_server/skills/simulation/operations.py @@ -14,6 +14,7 @@ import psutil from mcp_server.config import LOG_TAIL_DEFAULT, OSCLI_GEM_PATH, OSCLI_GEMFILE, RUN_ROOT +from mcp_server.telemetry import traced from mcp_server.util import resolve_run_dir # Where the MCP server stores runs inside the container @@ -603,6 +604,7 @@ def validate_model_op() -> dict[str, Any]: } +@traced() def run_simulation(osm_path: str, epw_path: str | None = None, name: str | None = None) -> dict[str, Any]: """Create a minimal OSW from an OSM file and run the simulation. 
diff --git a/mcp_server/telemetry.py b/mcp_server/telemetry.py new file mode 100644 index 0000000..3691bc2 --- /dev/null +++ b/mcp_server/telemetry.py @@ -0,0 +1,250 @@ +"""OpenLLMetry (Traceloop) tracing for the openstudio-mcp server. + +Optional: a no-op unless traceloop-sdk is installed (included in [telemetry] extra). +Zero overhead when absent: no import errors, all calls become pass-throughs. + +Install: + pip install 'openstudio-mcp[telemetry]' + +Environment variables: + TRACELOOP_BASE_URL OTLP / Traceloop-compatible endpoint, e.g.: + http://localhost:4318 (local OTEL collector) + https://api.traceloop.com (Traceloop cloud, needs API key) + Unset -> telemetry disabled (no-op). + TRACELOOP_API_KEY API key for Traceloop cloud (not required for generic OTLP). + OTEL_SERVICE_NAME Service name emitted on every span. Default: "openstudio-mcp". + OTEL_EXPORT_BATCH "false" -> sync exporting (dev). Default: batch mode. + TRACELOOP_TRACE_CONTENT "false" -> omit tool args from spans (privacy). + +Usage: + from mcp_server.telemetry import init_telemetry, trace_operation, traced + + # In main(), before FastMCP is instantiated (so McpInstrumentor can patch it): + init_telemetry() + + # Decorate a key operation: + @traced() + def run_simulation(osm_path: str, ...) -> dict: ... + + # Or use a context manager for finer control: + with trace_operation("prepare_model", {"path": osm_path}) as span: + result = do_work() +""" +from __future__ import annotations + +import importlib.util +import json +import logging +import sys +from contextlib import contextmanager +from typing import Any, Callable, TypeVar + +logger = logging.getLogger(__name__) + +_TELEMETRY_INITIALIZED = False +# True only after Traceloop.init() succeeds with a valid endpoint. +# traced() checks this at call time to avoid traceloop stdout warnings. 
_TELEMETRY_ENABLED = False
try:
    # find_spec imports the parent package first, so a missing "traceloop"
    # distribution surfaces as ModuleNotFoundError rather than returning None.
    _SDK_AVAILABLE = importlib.util.find_spec("traceloop.sdk") is not None
except (ModuleNotFoundError, ValueError):
    _SDK_AVAILABLE = False

# Max chars for any single span attribute value (truncation marker included).
_MAX_ATTR_LEN = 512
# Marker appended to values shortened by _truncate(); counts toward the cap.
_TRUNCATION_SUFFIX = "..."

F = TypeVar("F", bound=Callable[..., Any])


class _NoopSpan:
    """Minimal no-op span used when opentelemetry is not installed.

    Implements only the span methods this module and its callers use, so
    ``with trace_operation(...) as span`` works without any OTel dependency.
    """

    def set_attribute(self, *args: Any, **kwargs: Any) -> None:
        pass

    def set_status(self, *args: Any, **kwargs: Any) -> None:
        pass

    def record_exception(self, *args: Any, **kwargs: Any) -> None:
        pass

    def is_recording(self) -> bool:
        """Always False: nothing is ever recorded or exported."""
        return False


def init_telemetry() -> bool:
    """Initialize OpenLLMetry tracing. Idempotent — safe to call multiple times.

    Reads ``TRACELOOP_BASE_URL`` (endpoint; unset/blank disables telemetry),
    ``OTEL_SERVICE_NAME`` (default ``"openstudio-mcp"``) and
    ``OTEL_EXPORT_BATCH`` (``"false"`` selects non-batched export).

    When the SDK is installed and an endpoint is configured, calls
    Traceloop.init() to configure the exporter and then
    McpInstrumentor().instrument() to auto-trace FastMCP tool calls.

    Returns:
        True if telemetry was enabled; False otherwise (SDK absent, no
        endpoint, or initialization raised). Never raises ``Exception``:
        init failures are logged and reported as False so they cannot break
        server startup.
    """
    global _TELEMETRY_INITIALIZED, _TELEMETRY_ENABLED

    if _TELEMETRY_INITIALIZED:
        return _TELEMETRY_ENABLED

    import os
    from contextlib import redirect_stdout

    endpoint = os.environ.get("TRACELOOP_BASE_URL", "").strip()

    if not _SDK_AVAILABLE:
        if endpoint:
            logger.warning(
                "TRACELOOP_BASE_URL is set but traceloop-sdk is not installed. "
                "Install telemetry extras: pip install 'openstudio-mcp[telemetry]'"
            )
        _TELEMETRY_INITIALIZED = True
        return False

    if not endpoint:
        logger.debug("TRACELOOP_BASE_URL not set -- telemetry disabled")
        _TELEMETRY_INITIALIZED = True
        return False

    try:
        from opentelemetry.instrumentation.mcp import McpInstrumentor
        from traceloop.sdk import Traceloop

        service_name = os.environ.get("OTEL_SERVICE_NAME", "openstudio-mcp")
        batch_setting = os.environ.get("OTEL_EXPORT_BATCH", "true").strip().lower()
        disable_batch = batch_setting == "false"

        # Initialize Traceloop FIRST so its TracerProvider is live before we
        # patch FastMCP. McpInstrumentor wraps FastMCP tool calls; if the
        # provider isn't established yet those spans have nowhere to go.
        # Traceloop.init() uses print() for status messages — send stdout to
        # stderr for the duration of the call to avoid corrupting the MCP
        # JSON-RPC stdio pipe. redirect_stdout restores sys.stdout even when
        # init() raises.
        with redirect_stdout(sys.stderr):
            Traceloop.init(
                app_name=service_name,
                api_endpoint=endpoint,
                disable_batch=disable_batch,
            )

        # Patch FastMCP AFTER the provider is live so auto-traced tool calls
        # have a real exporter destination.
        McpInstrumentor().instrument()

        _TELEMETRY_INITIALIZED = True
        _TELEMETRY_ENABLED = True
        logger.info(
            "OpenLLMetry enabled: endpoint=%s service=%s batch=%s",
            endpoint,
            service_name,
            not disable_batch,
        )
        return True

    except Exception:
        # Top-level boundary: telemetry must never take the server down.
        logger.exception("Failed to initialize OpenLLMetry -- telemetry disabled")
        _TELEMETRY_INITIALIZED = True
        return False


@contextmanager
def trace_operation(name: str, attributes: dict[str, Any] | None = None):
    """Context manager that wraps a block in a child INTERNAL span.

    Uses the active OpenTelemetry TracerProvider (configured by Traceloop.init()).
    Falls back to a no-op span when opentelemetry is not installed or telemetry
    is not configured — safe to use unconditionally.

    Args:
        name: Span name, e.g. "prepare_model".
        attributes: Optional initial attributes (values serialized and capped
            by _truncate()).

    Yields:
        The active Span (may be a NonRecordingSpan or _NoopSpan when telemetry is off).

    Raises:
        Whatever the wrapped block raises. ``Exception`` subclasses are
        recorded on the span and the span is marked ERROR before re-raising.

    Example::

        with trace_operation("apply_measure", {"measure_dir": measure_dir}) as span:
            result = _do_apply(...)
            span.set_attribute("ok", str(result.get("ok", False)))
    """
    try:
        from opentelemetry import trace
        from opentelemetry.trace import SpanKind, StatusCode
    except ImportError:
        trace = None

    if trace is None:
        # Yield outside the ``except`` block so an error raised by the
        # caller's body is not raised while the ImportError is being handled.
        yield _NoopSpan()
        return

    tracer = trace.get_tracer("openstudio-mcp")
    # Exceptions are recorded by the handler below, so switch off the
    # tracer's own record/set-status-on-exception behaviour; leaving both on
    # produces two exception events per failure and lets the automatic status
    # description replace the one set here.
    with tracer.start_as_current_span(
        name,
        kind=SpanKind.INTERNAL,
        record_exception=False,
        set_status_on_exception=False,
    ) as span:
        recording = span.is_recording()
        if recording and attributes:
            for key, val in attributes.items():
                span.set_attribute(key, _truncate(val))
        try:
            yield span
        except Exception as exc:
            if recording:
                span.record_exception(exc)
                span.set_status(StatusCode.ERROR, str(exc))
            raise


def traced(op_name: str | None = None) -> Callable[[F], F]:
    """Decorator that wraps a synchronous operation in a trace span.

    Uses trace_operation() context manager to create a span. Only active when
    telemetry has been successfully enabled via init_telemetry(). This avoids
    traceloop stdout warnings when the SDK is installed but no endpoint is set.
    The enabled flag is checked on every call, so functions decorated at
    import time start emitting spans once init_telemetry() has succeeded.

    Marks the span ERROR when the function returns a dict with ok=False.

    Args:
        op_name: Span name override. Defaults to the function name.

    Returns:
        A decorator preserving the wrapped function's metadata and return value.

    Example::

        @traced()
        def run_simulation(osm_path: str, ...) -> dict: ...
    """
    import functools

    def decorator(fn: F) -> F:
        span_name = op_name or fn.__name__

        @functools.wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            if not _TELEMETRY_ENABLED:
                return fn(*args, **kwargs)

            with trace_operation(span_name) as span:
                result = fn(*args, **kwargs)
                if isinstance(result, dict) and result.get("ok") is False:
                    _mark_span_error(span, result)
                return result

        return wrapper  # type: ignore[return-value]

    return decorator


def _mark_span_error(span: Any, result: dict[str, Any]) -> None:
    """Set ERROR status on the given span from a failed tool result.

    The message comes from ``result["error"]``, then ``result["message"]``,
    then a generic fallback. Best-effort: failures here are logged at DEBUG
    and never propagate into the traced operation.
    """
    try:
        from opentelemetry.trace import NonRecordingSpan, StatusCode

        if isinstance(span, NonRecordingSpan):
            return
        error_msg = result.get("error") or result.get("message") or "tool returned ok=False"
        span.set_status(StatusCode.ERROR, str(error_msg))
        span.set_attribute("error.message", str(error_msg)[:_MAX_ATTR_LEN])
    except Exception:
        logger.debug("Could not mark span as ERROR", exc_info=True)


def _truncate(value: Any) -> str:
    """Serialize a value to a JSON string capped at _MAX_ATTR_LEN chars.

    Values that json cannot encode fall back to ``str(value)``. Over-long
    results are cut so that the text plus the trailing "..." marker is
    exactly _MAX_ATTR_LEN characters.
    """
    try:
        s = json.dumps(value, default=str)
    except Exception:
        s = str(value)
    if len(s) > _MAX_ATTR_LEN:
        keep = _MAX_ATTR_LEN - len(_TRUNCATION_SUFFIX)
        return s[:keep] + _TRUNCATION_SUFFIX
    return s
"""Integration test for Example 20: Hackathon Deep Retrofit Package workflow.

Exercises: create baseline → simulate → apply wall insulation ECM (all exterior
walls) + thermostat widening ECM → re-simulate → compare_runs shows energy reduction.

This validates the core ECM stacking workflow described in
docs/examples/20_deep_retrofit_package.md.
"""
import asyncio
import uuid

import pytest
from conftest import EPW_PATH, integration_enabled, poll_until_done, server_params, unwrap
from mcp import ClientSession
from mcp.client.stdio import stdio_client


@pytest.mark.integration
def test_skill_ecm_package_workflow():
    """Hackathon ECM package: baseline sim → wall insulation + thermostat ECMs → compare.

    Drives the MCP server over stdio in one session, so every step below
    depends on the model state left by the previous one. Skipped when
    integration tests are disabled.
    """
    # Validates: multiple ECMs (exterior wall insulation + thermostat widening) can be
    # stacked on a baseline model and produce a measurable energy reduction vs. baseline.
    if not integration_enabled():
        pytest.skip("integration disabled")

    async def _run():
        async with stdio_client(server_params()) as (r, w):
            async with ClientSession(r, w) as s:
                await s.initialize()
                # Unique suffix keeps the model/run files of repeated or
                # parallel test runs from colliding under /runs.
                name = f"ecm_pkg_{uuid.uuid4().hex[:8]}"

                # ----------------------------------------------------------------
                # Step 1: Create baseline model (System 3 PSZ-AC, 40% WWR)
                # ----------------------------------------------------------------
                cr = unwrap(await s.call_tool("create_baseline_osm", {
                    "name": name,
                    "ashrae_sys_num": "03",
                    "wwr": 0.4,
                }))
                assert cr["ok"] is True, f"create_baseline_osm failed: {cr}"

                lr = unwrap(await s.call_tool("load_osm_model", {
                    "osm_path": cr["osm_path"],
                }))
                assert lr["ok"] is True

                # Step 2: Set weather + design days (Boston TMY3)
                wr = unwrap(await s.call_tool("change_building_location", {
                    "weather_file": EPW_PATH,
                }))
                assert wr["ok"] is True, f"change_building_location failed: {wr}"

                # Step 3: Save baseline and simulate
                baseline_path = f"/runs/{name}_baseline.osm"
                sr = unwrap(await s.call_tool("save_osm_model", {
                    "osm_path": baseline_path,
                }))
                assert sr["ok"] is True

                sim = unwrap(await s.call_tool("run_simulation", {
                    "osm_path": baseline_path,
                    "epw_path": EPW_PATH,
                }))
                assert sim["ok"] is True, f"baseline run_simulation failed: {sim}"
                baseline_run_id = sim["run_id"]

                status = await poll_until_done(s, baseline_run_id)
                assert status["run"]["status"] == "success", f"Baseline sim failed: {status}"

                # Step 4: Verify baseline has annual results (required for compare_runs)
                baseline_metrics = unwrap(await s.call_tool("extract_summary_metrics", {
                    "run_id": baseline_run_id,
                }))
                assert baseline_metrics["ok"] is True
                # Accept the metrics either nested under "metrics" or at the
                # top level of the response.
                b_metrics = baseline_metrics.get("metrics", baseline_metrics)
                baseline_eui = b_metrics.get("eui_kBtu_ft2")
                assert baseline_eui is not None, (
                    "Baseline simulation must produce annual EUI results; "
                    f"got metrics keys: {list(b_metrics.keys())}"
                )
                assert baseline_eui > 0, f"Baseline EUI should be positive, got {baseline_eui}"

                # ----------------------------------------------------------------
                # ECM 1: High-R wall insulation
                # Create R-20 (IP) single-layer construction:
                #   R-20 IP = 3.52 m²·K/W → thickness = R×k = 3.52×0.04 = 0.141 m
                # ----------------------------------------------------------------
                mat = unwrap(await s.call_tool("create_standard_opaque_material", {
                    "name": "R20_Insulation",
                    "thickness_m": 0.141,
                    "conductivity_w_m_k": 0.04,
                    "density_kg_m3": 30.0,
                    "specific_heat_j_kg_k": 1000.0,
                }))
                assert mat["ok"] is True, f"create_standard_opaque_material failed: {mat}"

                con = unwrap(await s.call_tool("create_construction", {
                    "name": "High_R_Wall",
                    "material_names": ["R20_Insulation"],
                }))
                assert con["ok"] is True, f"create_construction failed: {con}"

                # Verify the construction R-value before applying:
                # R-20 IP = 3.52 m²·K/W → thickness=0.141 m, k=0.04 W/m·K
                con_details = unwrap(await s.call_tool("get_construction_details", {
                    "construction_name": "High_R_Wall",
                }))
                assert con_details["ok"] is True
                layers = con_details["construction"]["layers"]
                assert len(layers) == 1, f"Expected 1 layer, got {len(layers)}"
                layer = layers[0]
                assert abs(layer["thickness_m"] - 0.141) < 0.001, (
                    f"R-20 insulation thickness should be ~0.141 m, got {layer['thickness_m']}"
                )
                assert abs(layer["conductivity_w_m_k"] - 0.04) < 0.001, (
                    f"R-20 insulation conductivity should be 0.04 W/m·K, got {layer['conductivity_w_m_k']}"
                )

                # Find all exterior walls and apply new construction
                # NOTE(review): max_results=0 presumably means "no limit" —
                # confirm against the list_surfaces tool.
                surfs = unwrap(await s.call_tool("list_surfaces", {
                    "surface_type": "Wall",
                    "boundary": "Outdoors",
                    "max_results": 0,
                }))
                assert surfs["ok"] is True
                ext_walls = surfs["surfaces"]
                assert len(ext_walls) > 0, "Baseline model must have exterior walls"

                for wall in ext_walls:
                    assign = unwrap(await s.call_tool("assign_construction_to_surface", {
                        "surface_name": wall["name"],
                        "construction_name": "High_R_Wall",
                    }))
                    assert assign["ok"] is True, (
                        f"assign_construction_to_surface failed for '{wall['name']}': {assign}"
                    )

                # Spot-check: verify the first wall's construction actually changed
                spot_check = unwrap(await s.call_tool("get_surface_details", {
                    "surface_name": ext_walls[0]["name"],
                }))
                assert spot_check["ok"] is True
                assert spot_check["surface"]["construction"] == "High_R_Wall", (
                    f"Wall '{ext_walls[0]['name']}' construction not updated: "
                    f"got '{spot_check['surface']['construction']}'"
                )

                # ----------------------------------------------------------------
                # ECM 2: Thermostat deadband widening
                # Capture cooling schedule name before and after to confirm the
                # measure cloned and modified the schedules (not a no-op).
                # ----------------------------------------------------------------
                zones_before = unwrap(await s.call_tool("list_thermal_zones", {
                    "detailed": True,
                    "max_results": 1,
                }))
                assert zones_before["ok"] is True
                assert len(zones_before["thermal_zones"]) > 0
                clg_sched_before = zones_before["thermal_zones"][0].get("cooling_setpoint_schedule")

                ecm2 = unwrap(await s.call_tool("adjust_thermostat_setpoints", {
                    "cooling_offset_f": 2.0,
                    "heating_offset_f": -2.0,
                }))
                assert ecm2["ok"] is True, f"adjust_thermostat_setpoints failed: {ecm2}"

                zones_after = unwrap(await s.call_tool("list_thermal_zones", {
                    "detailed": True,
                    "max_results": 1,
                }))
                assert zones_after["ok"] is True
                clg_sched_after = zones_after["thermal_zones"][0].get("cooling_setpoint_schedule")
                # The measure clones schedules — the name must change.
                # NOTE(review): this check is skipped entirely when the first
                # zone reports no cooling schedule, so in that case ECM 2 is
                # only validated by the EUI comparison below.
                if clg_sched_before is not None:
                    assert clg_sched_after != clg_sched_before, (
                        f"Thermostat cooling schedule was not updated by ECM 2: "
                        f"schedule name unchanged ('{clg_sched_before}')"
                    )

                # ----------------------------------------------------------------
                # Step 5: Save retrofit model and simulate
                # ----------------------------------------------------------------
                retrofit_path = f"/runs/{name}_retrofit.osm"
                sr2 = unwrap(await s.call_tool("save_osm_model", {
                    "osm_path": retrofit_path,
                }))
                assert sr2["ok"] is True

                sim2 = unwrap(await s.call_tool("run_simulation", {
                    "osm_path": retrofit_path,
                    "epw_path": EPW_PATH,
                }))
                assert sim2["ok"] is True, f"retrofit run_simulation failed: {sim2}"
                retrofit_run_id = sim2["run_id"]

                status2 = await poll_until_done(s, retrofit_run_id)
                assert status2["run"]["status"] == "success", f"Retrofit sim failed: {status2}"

                # ----------------------------------------------------------------
                # Step 6: Compare runs — ECMs should reduce energy
                # ----------------------------------------------------------------
                comparison = unwrap(await s.call_tool("compare_runs", {
                    "baseline_run_id": baseline_run_id,
                    "retrofit_run_id": retrofit_run_id,
                }))
                assert comparison["ok"] is True, f"compare_runs failed: {comparison}"
                # Verify compare_runs used the correct run IDs
                assert comparison["baseline"]["run_id"] == baseline_run_id
                assert comparison["retrofit"]["run_id"] == retrofit_run_id
                # ECM package should reduce EUI — delta must be negative
                delta_eui = comparison.get("delta_eui_kBtu_ft2")
                assert delta_eui is not None, (
                    "compare_runs must produce a delta_eui_kBtu_ft2 when both runs have annual results"
                )
                assert delta_eui < 0, (
                    f"ECM package should reduce EUI: delta_eui={delta_eui:.2f} kBtu/ft² "
                    f"(expected negative)"
                )

                # Retrofit EUI must be lower than baseline (both ECMs reduce energy)
                retro_metrics = unwrap(await s.call_tool("extract_summary_metrics", {
                    "run_id": retrofit_run_id,
                }))
                assert retro_metrics["ok"] is True
                r_metrics = retro_metrics.get("metrics", retro_metrics)
                retrofit_eui = r_metrics.get("eui_kBtu_ft2")
                assert retrofit_eui is not None, (
                    "Retrofit simulation must produce annual EUI results; "
                    f"got metrics keys: {list(r_metrics.keys())}"
                )
                # Independent cross-check of the compare_runs sign: the
                # separately extracted EUIs must also show a strict reduction
                # (direction only; the delta magnitude is not compared).
                assert retrofit_eui < baseline_eui, (
                    f"ECM package should reduce EUI: "
                    f"baseline={baseline_eui:.2f}, retrofit={retrofit_eui:.2f} kBtu/ft²"
                )

    asyncio.run(_run())
"""Unit tests for mcp_server/telemetry.py (OpenLLMetry / traceloop-sdk integration).

These tests run without OpenStudio and without Docker.

Validates:
- Telemetry is a no-op when TRACELOOP_BASE_URL is not set
- init_telemetry() calls McpInstrumentor().instrument() and Traceloop.init()
- init_telemetry() is idempotent and returns correct value on second call
- McpInstrumentor is only called when endpoint is configured
- traced() is a no-op when telemetry is not enabled
- traced() creates a span and marks ERROR on ok=False when telemetry is enabled
- trace_operation() creates a child span when a TracerProvider is configured
- _truncate() caps values at _MAX_ATTR_LEN
- sys.stdout is restored even when Traceloop.init() raises
- init_telemetry() handles Traceloop.init() exceptions gracefully

Regression: these tests guard against the telemetry module breaking server
startup or silently swallowing init errors.
"""
from __future__ import annotations

import sys
from contextlib import contextmanager
from typing import Any
from unittest.mock import MagicMock, patch

import pytest

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

@contextmanager
def _reset_telemetry_module():
    """Force a clean re-import of telemetry so _TELEMETRY_INITIALIZED resets.

    Removes ``mcp_server.telemetry`` from ``sys.modules`` for the duration of
    the block, so an import inside it executes the module afresh. On exit the
    throwaway module is discarded and the previous state is restored, both in
    ``sys.modules`` and on the parent package's ``telemetry`` attribute (the
    import system rebinds that attribute to the fresh module, which would
    otherwise leak into later ``import mcp_server.telemetry as ...`` lookups).
    """
    pkg_name, _, attr_name = "mcp_server.telemetry".rpartition(".")
    mod_name = f"{pkg_name}.{attr_name}"
    missing = object()

    parent = sys.modules.get(pkg_name)
    previous_attr = getattr(parent, attr_name, missing) if parent is not None else missing
    previous_module = sys.modules.pop(mod_name, None)
    try:
        yield
    finally:
        sys.modules.pop(mod_name, None)
        if previous_module is not None:
            sys.modules[mod_name] = previous_module

        # The parent package may have been imported for the first time inside
        # the block, so look it up again rather than reusing ``parent``.
        parent = sys.modules.get(pkg_name)
        if parent is not None:
            if previous_attr is not missing:
                setattr(parent, attr_name, previous_attr)
            elif hasattr(parent, attr_name):
                delattr(parent, attr_name)


def _make_in_memory_setup():
    """Return (provider, exporter, tracer) using the OTel SDK InMemorySpanExporter.

    The provider exports through a SimpleSpanProcessor, so finished spans can
    be read from ``exporter.get_finished_spans()``.
    """
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

    exporter = InMemorySpanExporter()
    provider = TracerProvider()
    provider.add_span_processor(SimpleSpanProcessor(exporter))
    tracer = provider.get_tracer("test")
    return provider, exporter, tracer
# ---------------------------------------------------------------------------
# init_telemetry tests
# ---------------------------------------------------------------------------

@contextmanager
def _telemetry_with_fake_sdk(traceloop_mod: Any, mcp_mod: Any):
    """Yield a freshly imported telemetry module backed by stand-in SDK modules.

    ``traceloop_mod`` replaces ``traceloop.sdk`` and ``mcp_mod`` replaces
    ``opentelemetry.instrumentation.mcp`` in ``sys.modules`` for the duration
    of the block; ``_SDK_AVAILABLE`` is forced to True so init_telemetry()
    reaches its import step.
    """
    fake_modules = {
        "traceloop": MagicMock(),
        "traceloop.sdk": traceloop_mod,
        "traceloop.sdk.decorators": MagicMock(),
        "opentelemetry.instrumentation.mcp": mcp_mod,
    }
    with _reset_telemetry_module(), patch.dict("sys.modules", fake_modules):
        import mcp_server.telemetry as tel

        tel._SDK_AVAILABLE = True
        yield tel


@pytest.mark.unit
def test_init_no_endpoint_returns_false(monkeypatch):
    # Validates: init_telemetry returns False when TRACELOOP_BASE_URL is unset, and does NOT call McpInstrumentor or Traceloop.init.
    monkeypatch.delenv("TRACELOOP_BASE_URL", raising=False)

    traceloop_mod = MagicMock()
    mcp_mod = MagicMock()

    with _telemetry_with_fake_sdk(traceloop_mod, mcp_mod) as tel:
        result = tel.init_telemetry()

    assert result is False
    traceloop_mod.Traceloop.init.assert_not_called()
    # McpInstrumentor should NOT be called when no endpoint is set
    mcp_mod.McpInstrumentor.assert_not_called()


@pytest.mark.unit
def test_init_no_sdk_returns_false(monkeypatch):
    # Validates: init_telemetry returns False (no warning) when SDK is absent and no endpoint is configured.
    monkeypatch.delenv("TRACELOOP_BASE_URL", raising=False)

    with _reset_telemetry_module():
        import mcp_server.telemetry as tel
        tel._SDK_AVAILABLE = False
        result = tel.init_telemetry()

    assert result is False


@pytest.mark.unit
def test_init_sdk_missing_with_endpoint_logs_warning(monkeypatch, caplog):
    # Validates: a warning is logged when endpoint is set but SDK is not installed.
    import logging
    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")

    with _reset_telemetry_module():
        import mcp_server.telemetry as tel
        tel._SDK_AVAILABLE = False
        with caplog.at_level(logging.WARNING, logger="mcp_server.telemetry"):
            tel.init_telemetry()

    assert any("traceloop-sdk is not installed" in r.message for r in caplog.records)


@pytest.mark.unit
def test_init_instruments_mcp_and_calls_traceloop_init(monkeypatch):
    # Validates: when endpoint is set, McpInstrumentor().instrument() and Traceloop.init() are both called.
    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")
    monkeypatch.setenv("OTEL_SERVICE_NAME", "test-svc")

    traceloop_mod = MagicMock()
    mcp_mod = MagicMock()

    with _telemetry_with_fake_sdk(traceloop_mod, mcp_mod) as tel:
        result = tel.init_telemetry()

    assert result is True
    # McpInstrumentor() returns ``return_value``; instrument() is called on it.
    mcp_mod.McpInstrumentor.return_value.instrument.assert_called_once()
    traceloop_mod.Traceloop.init.assert_called_once()
    _, kwargs = traceloop_mod.Traceloop.init.call_args
    assert kwargs["app_name"] == "test-svc"
    assert kwargs["api_endpoint"] == "http://localhost:4318"


@pytest.mark.unit
def test_init_idempotent(monkeypatch):
    # Validates: calling init_telemetry twice only initializes once.
    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")

    traceloop_mod = MagicMock()
    mcp_mod = MagicMock()

    with _telemetry_with_fake_sdk(traceloop_mod, mcp_mod) as tel:
        r1 = tel.init_telemetry()
        r2 = tel.init_telemetry()

    assert traceloop_mod.Traceloop.init.call_count == 1
    assert r1 is True
    assert r2 is True


@pytest.mark.unit
def test_init_idempotent_returns_false_when_disabled(monkeypatch):
    # Validates: second call returns False (not True) when first call disabled telemetry.
    monkeypatch.delenv("TRACELOOP_BASE_URL", raising=False)

    with _reset_telemetry_module():
        import mcp_server.telemetry as tel
        tel._SDK_AVAILABLE = True
        r1 = tel.init_telemetry()
        r2 = tel.init_telemetry()

    assert r1 is False
    assert r2 is False


@pytest.mark.unit
def test_init_disable_batch_flag(monkeypatch):
    # Validates: OTEL_EXPORT_BATCH=false sets disable_batch=True in Traceloop.init.
    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")
    monkeypatch.setenv("OTEL_EXPORT_BATCH", "false")

    traceloop_mod = MagicMock()

    with _telemetry_with_fake_sdk(traceloop_mod, MagicMock()) as tel:
        tel.init_telemetry()

    _, kwargs = traceloop_mod.Traceloop.init.call_args
    assert kwargs["disable_batch"] is True


@pytest.mark.unit
def test_init_restores_stdout_on_exception(monkeypatch):
    # Validates: sys.stdout is restored even when Traceloop.init() raises.
    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")

    traceloop_mod = MagicMock()
    traceloop_mod.Traceloop.init.side_effect = RuntimeError("init boom")

    original_stdout = sys.stdout

    with _telemetry_with_fake_sdk(traceloop_mod, MagicMock()) as tel:
        result = tel.init_telemetry()

    assert result is False
    assert sys.stdout is original_stdout


# ---------------------------------------------------------------------------
# _truncate tests
# ---------------------------------------------------------------------------

@pytest.mark.unit
def test_truncate_short_value():
    # Validates: short values are returned unchanged.
    from mcp_server.telemetry import _MAX_ATTR_LEN, _truncate
    assert _truncate("hello") == '"hello"'
    assert len(_truncate("hello")) < _MAX_ATTR_LEN


@pytest.mark.unit
def test_truncate_long_value():
    # Validates: values longer than _MAX_ATTR_LEN are capped with ellipsis.
    from mcp_server.telemetry import _MAX_ATTR_LEN, _truncate
    long_val = "x" * 2000
    result = _truncate(long_val)
    # Allow at most the "..." suffix beyond the cap.
    assert len(result) <= _MAX_ATTR_LEN + len("...")
    assert result.endswith("...")


# ---------------------------------------------------------------------------
# trace_operation tests
# ---------------------------------------------------------------------------

@pytest.mark.unit
def test_trace_operation_noop_when_no_provider():
    # Validates: trace_operation is safe to call when no provider is configured.
    from mcp_server.telemetry import trace_operation
    ran = []
    with trace_operation("test_op") as span:
        ran.append(True)
        # NonRecordingSpan.set_attribute is a no-op -- this must not crash
        span.set_attribute("key", "value")
    assert ran == [True]


@pytest.mark.unit
def test_trace_operation_noop_span_on_import_error():
    # Validates: trace_operation yields a _NoopSpan when opentelemetry is absent.
    # Regression: trace_operation() must not raise ImportError in production
    # environments where dev extras (opentelemetry-api) are not installed.
    from mcp_server.telemetry import _NoopSpan, trace_operation

    # Simulate opentelemetry being absent: a None entry in sys.modules makes
    # the import statement raise ImportError.
    otel_keys = {"opentelemetry", "opentelemetry.trace"}
    with patch.dict(sys.modules, dict.fromkeys(otel_keys, None)):
        ran = []
        with trace_operation("test_noop") as span:
            ran.append(True)
            assert isinstance(span, _NoopSpan)
            span.set_attribute("key", "value")
            span.set_status("ok")
            span.record_exception(None)
        assert ran == [True]


@pytest.mark.unit
def test_trace_operation_child_span():
    # Validates: trace_operation creates a named child span when a TracerProvider is configured.
    from opentelemetry import trace as otel_trace

    from mcp_server.telemetry import trace_operation

    _provider, exporter, tracer = _make_in_memory_setup()
    with patch.object(otel_trace, "get_tracer", return_value=tracer):
        with trace_operation("my_op", {"key": "val"}):
            pass

    spans = exporter.get_finished_spans()
    assert len(spans) == 1
    assert spans[0].name == "my_op"
    assert spans[0].attributes.get("key") == '"val"'


@pytest.mark.unit
def test_trace_operation_records_exception():
    # Validates: trace_operation sets ERROR status when an exception is raised.
    from opentelemetry import trace as otel_trace
    from opentelemetry.trace import StatusCode

    from mcp_server.telemetry import trace_operation

    _provider, exporter, tracer = _make_in_memory_setup()
    with patch.object(otel_trace, "get_tracer", return_value=tracer):
        with pytest.raises(ValueError):
            with trace_operation("failing_op"):
                raise ValueError("boom")

    spans = exporter.get_finished_spans()
    assert spans[0].status.status_code == StatusCode.ERROR


# ---------------------------------------------------------------------------
# traced() decorator tests
# ---------------------------------------------------------------------------

@pytest.mark.unit
def test_traced_noop_when_telemetry_disabled():
    # Validates: traced() wrapper calls the original function directly when _TELEMETRY_ENABLED is False (SDK installed but no endpoint configured).
    with _reset_telemetry_module():
        import mcp_server.telemetry as tel
        tel._TELEMETRY_ENABLED = False

        call_log: list[str] = []

        def my_fn(x: int) -> dict:
            call_log.append("called")
            return {"ok": True, "value": x}

        wrapped = tel.traced()(my_fn)
        result = wrapped(42)

    assert result == {"ok": True, "value": 42}
    assert call_log == ["called"]


@pytest.mark.unit
def test_traced_creates_span_when_enabled():
    # Validates: traced() creates a span via trace_operation when _TELEMETRY_ENABLED is True.
    from opentelemetry import trace as otel_trace

    _provider, exporter, tracer = _make_in_memory_setup()

    with _reset_telemetry_module():
        import mcp_server.telemetry as tel
        tel._TELEMETRY_ENABLED = True

        @tel.traced(op_name="custom_name")
        def my_fn() -> dict:
            return {"ok": True}

        with patch.object(otel_trace, "get_tracer", return_value=tracer):
            result = my_fn()

    assert result == {"ok": True}
    spans = exporter.get_finished_spans()
    assert len(spans) == 1
    assert spans[0].name == "custom_name"


@pytest.mark.unit
def test_traced_marks_error_on_ok_false():
    # Validates: traced() marks the span ERROR when result has ok=False.
    from opentelemetry import trace as otel_trace
    from opentelemetry.trace import StatusCode

    _provider, exporter, tracer = _make_in_memory_setup()

    with _reset_telemetry_module():
        import mcp_server.telemetry as tel
        tel._TELEMETRY_ENABLED = True

        @tel.traced()
        def failing_op() -> dict:
            return {"ok": False, "error": "something failed"}

        with patch.object(otel_trace, "get_tracer", return_value=tracer):
            result = failing_op()

    assert result == {"ok": False, "error": "something failed"}
    spans = exporter.get_finished_spans()
    assert len(spans) == 1
    assert spans[0].status.status_code == StatusCode.ERROR
    assert spans[0].attributes.get("error.message") == "something failed"


@pytest.mark.unit
def test_traced_uses_function_name_as_default_span_name():
    # Validates: traced() uses the function name when op_name is not specified.
    from opentelemetry import trace as otel_trace

    _provider, exporter, tracer = _make_in_memory_setup()

    with _reset_telemetry_module():
        import mcp_server.telemetry as tel
        tel._TELEMETRY_ENABLED = True

        @tel.traced()
        def run_simulation(osm_path: str) -> dict:
            return {"ok": True}

        with patch.object(otel_trace, "get_tracer", return_value=tracer):
            run_simulation("/tmp/test.osm")

    spans = exporter.get_finished_spans()
    assert len(spans) == 1
    assert spans[0].name == "run_simulation"


# ---------------------------------------------------------------------------
# Startup wiring regression — init_telemetry() must appear before FastMCP
# is instantiated in server.main(). Checked via AST so this runs without
# importing server.py (which requires /runs to exist and openstudio).
# ---------------------------------------------------------------------------

@pytest.mark.unit
def test_server_main_calls_init_telemetry_before_fastmcp():
    # Regression: init_telemetry() must be called before FastMCP is
    # instantiated inside server.main(). If the order is reversed,
    # McpInstrumentor cannot patch FastMCP.__init__ and all auto-
    # instrumentation silently stops working.
    import ast
    from pathlib import Path

    server_path = Path(__file__).parent.parent / "mcp_server" / "server.py"
    tree = ast.parse(server_path.read_text(encoding="utf-8"))

    main_fn = next(
        (n for n in ast.walk(tree)
         if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) and n.name == "main"),
        None,
    )
    assert main_fn is not None, "main() not found in mcp_server/server.py"

    def called_name(node: ast.AST) -> str | None:
        """Return the callee name for ``f(...)`` / ``mod.f(...)`` calls, else None."""
        if not isinstance(node, ast.Call):
            return None
        func = node.func
        if isinstance(func, ast.Name):
            return func.id
        if isinstance(func, ast.Attribute):
            return func.attr
        return None

    # Collect every call site so the check also sees ``x = init_telemetry()``
    # and ``telemetry.init_telemetry()``, then compare the FIRST occurrence of
    # each: the earliest init must precede the earliest FastMCP construction.
    call_lines: dict[str, list[int]] = {"init_telemetry": [], "FastMCP": []}
    for node in ast.walk(main_fn):
        name = called_name(node)
        if name in call_lines:
            call_lines[name].append(node.lineno)

    assert call_lines["init_telemetry"], (
        "init_telemetry() call not found in server.main(). "
        "It must be called before FastMCP is instantiated."
    )
    assert call_lines["FastMCP"], "FastMCP() instantiation not found in server.main()."

    init_telemetry_line = min(call_lines["init_telemetry"])
    fastmcp_line = min(call_lines["FastMCP"])
    assert init_telemetry_line < fastmcp_line, (
        f"init_telemetry() (line {init_telemetry_line}) must come BEFORE "
        f"FastMCP() (line {fastmcp_line}) in server.main(). "
        "McpInstrumentor patches FastMCP.__init__ during init_telemetry(); "
        "reversing the order silently disables all auto-instrumentation."
    )


# ---------------------------------------------------------------------------
# Decorator coverage regression — @traced() must remain on all promised ops.
# Checked via AST to avoid importing skill modules that require openstudio.
# ---------------------------------------------------------------------------

@pytest.mark.unit
def test_traced_decorator_applied_to_promised_operations():
    # Regression: the CHANGELOG and README both promise that specific operations
    # emit spans. This test guards against accidental decorator removal by
    # inspecting the source AST of each operations file.
    import ast
    from pathlib import Path

    repo_root = Path(__file__).parent.parent
    expected = [
        ("mcp_server/skills/simulation/operations.py", "run_simulation"),
        ("mcp_server/skills/measures/operations.py", "apply_measure"),
        ("mcp_server/skills/measure_authoring/operations.py", "create_measure_op"),
        ("mcp_server/skills/comstock/operations.py", "create_typical_building"),
        ("mcp_server/skills/comstock/operations.py", "create_bar_building"),
        ("mcp_server/skills/comstock/operations.py", "create_new_building"),
        ("mcp_server/skills/common_measures/wrappers.py", "run_qaqc_checks_op"),
    ]

    def is_traced_call(decorator: ast.AST) -> bool:
        """True for ``@traced(...)`` and ``@<module>.traced(...)`` decorators."""
        if not isinstance(decorator, ast.Call):
            return False
        func = decorator.func
        if isinstance(func, ast.Name):
            return func.id == "traced"
        return isinstance(func, ast.Attribute) and func.attr == "traced"

    missing = []
    for rel_path, func_name in expected:
        tree = ast.parse((repo_root / rel_path).read_text(encoding="utf-8"))
        # Only the first definition with this name (in ast.walk order) is checked.
        func_node = next(
            (n for n in ast.walk(tree)
             if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) and n.name == func_name),
            None,
        )
        if func_node is None:
            missing.append(f"{rel_path}::{func_name} (function not found)")
        elif not any(is_traced_call(d) for d in func_node.decorator_list):
            missing.append(f"{rel_path}::{func_name}")

    assert not missing, (
        f"The following operations are missing @traced() decoration: {missing}. "
        "Every operation listed in the CHANGELOG/README tracing section must be "
        "wrapped with @traced() so it emits a span when telemetry is enabled."
    )