diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..68ff88b
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,27 @@
+# openstudio-mcp environment variable template.
+# Copy to .env and fill in your values — .env is gitignored and must never be committed.
+
+# ---------------------------------------------------------------------------
+# Telemetry (optional — requires pip install 'openstudio-mcp[telemetry]')
+# Leave TRACELOOP_BASE_URL unset to disable tracing entirely (zero overhead).
+# ---------------------------------------------------------------------------
+
+# OTLP HTTP endpoint.  Examples:
+#   Local Jaeger:       http://localhost:4318
+#   Traceloop cloud:    https://api.traceloop.com
+TRACELOOP_BASE_URL=
+
+# API key — required only for Traceloop cloud, not for generic OTLP backends.
+TRACELOOP_API_KEY=
+
+# Service name shown on every span.
+OTEL_SERVICE_NAME=openstudio-mcp
+
+# Set to "false" to use synchronous span export (useful in development).
+OTEL_EXPORT_BATCH=true
+
+# IMPORTANT PRIVACY SETTING: when "true" (default), tool arguments and outputs
+# — including file paths, model parameters, and simulation results — are
+# exported to the OTLP backend.  Set to "false" unless you have reviewed the
+# data being exported and your backend is self-hosted or trusted.
+TRACELOOP_TRACE_CONTENT=false
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ba4cf19..0fee907 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,6 +26,15 @@ jobs:
           mkdir -p runs
           docker run --rm -v "$PWD:/repo" -v "$PWD/runs:/runs" openstudio-mcp:dev bash -lc 'cd /repo && pytest -vv -m "not integration"'
 
+      - name: Smoke-install telemetry extra
+        # Validates that traceloop-sdk and its deps install cleanly from the
+        # pinned constraint in [telemetry]. Catches packaging drift that would
+        # make openstudio-mcp[telemetry] uninstallable without needing a full
+        # Docker rebuild.
+        run: |
+          docker run --rm -v "$PWD:/repo" openstudio-mcp:dev bash -lc \
+            'pip install --quiet -e "/repo[telemetry]" && python -c "from traceloop.sdk import Traceloop; print(\"traceloop-sdk OK\")"'
+
       - name: Save Docker image
         run: docker save openstudio-mcp:dev | gzip > /tmp/image.tar.gz
 
@@ -66,8 +75,8 @@ jobs:
               EXTRA_ENV="-e MCP_OSW_PATH=tests/assets/SEB_model/SEB4_baseboard/workflow.osw -e EXPECTED_EUI=1.8750760248144998 -e EXPECTED_EUI_RTOL=0.02 -e EXPECTED_EUI_ATOL=0.0"
               ;;
             2)
-              # common_measures, hvac_systems, geometry, zone terminal, skill_energy_report
-              FILES="tests/test_common_measures.py tests/test_hvac_systems.py tests/test_replace_zone_terminal.py tests/test_geometry.py tests/test_skill_energy_report.py"
+              # common_measures, hvac_systems, geometry, zone terminal, skill_energy_report, ecm_package
+              FILES="tests/test_common_measures.py tests/test_hvac_systems.py tests/test_replace_zone_terminal.py tests/test_geometry.py tests/test_skill_energy_report.py tests/test_skill_ecm_package.py"
               EXTRA_ENV=""
               ;;
             3)
@@ -82,7 +91,7 @@ jobs:
               ;;
             5)
               # HVAC supply sim smoke tests + hvac_validation + bar_building + concurrent regression
-              FILES="tests/test_hvac_supply_sim.py tests/test_hvac_validation.py tests/test_bar_building.py tests/test_concurrent_tools.py tests/test_stdout_logger_silence.py"
+              FILES="tests/test_hvac_supply_sim.py tests/test_hvac_validation.py tests/test_bar_building.py tests/test_concurrent_tools.py tests/test_stdout_logger_silence.py tests/test_telemetry.py"
               EXTRA_ENV=""
               ;;
           esac
diff --git a/.gitignore b/.gitignore
index 8396c12..7c70477 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,5 +33,11 @@ Thumbs.db
 # Code review artifacts
 docs/review/
 
+# Environment / secrets
+.env
+.env.*
+!.env.example
+
 # Codex CLI
 .codex/
+.mcp.json
diff --git a/.mcp.json.example b/.mcp.json.example
new file mode 100644
index 0000000..d69202c
--- /dev/null
+++ b/.mcp.json.example
@@ -0,0 +1,15 @@
+{
+  "mcpServers": {
+    "openstudio-mcp": {
+      "command": "docker",
+      "args": [
+        "run", "--rm", "-i",
+        "-v", "/ABSOLUTE/PATH/TO/inputs:/inputs",
+        "-v", "/ABSOLUTE/PATH/TO/runs:/runs",
+        "-v", "/ABSOLUTE/PATH/TO/openstudio-mcp/.claude/skills:/skills:ro",
+        "-e", "OPENSTUDIO_MCP_MODE=prod",
+        "openstudio-mcp:dev", "openstudio-mcp"
+      ]
+    }
+  }
+}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 301c256..888399c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,25 @@
 # Changelog
 
+## [Unreleased]
+
+### Added
+- **Optional OpenLLMetry tracing**: `pip install 'openstudio-mcp[telemetry]'` + `TRACELOOP_BASE_URL` env var enables distributed tracing via traceloop-sdk. Zero overhead when unset. Key operations (`run_simulation`, `apply_measure`, `create_measure`, `create_*_building`, `run_qaqc_checks`) emit named spans; every FastMCP tool call is auto-instrumented via `McpInstrumentor`.
+- **Per-client setup guides**: `docs/clients/` — detailed MCP config examples, tool limits, and performance notes for Claude Code, Claude Desktop, VS Code Copilot, Windsurf, Gemini CLI, and Cursor.
+- **Token context performance doc**: `docs/clients/token-context-performance.md` — benchmark of how each client handles the 142-tool surface and context overhead.
+- **SECURITY.md**: disclosure policy and supported versions.
+- **ECM package example**: `docs/examples/20_deep_retrofit_package.md` — wall insulation + thermostat + window + PV stack with expected EUI ranges.
+- **`.mcp.json.example`**: ready-to-use Claude Code MCP config.
+- **Docker tracing stack**: `docker/docker-compose.tracing.yml` + `docker/otel-collector-config.yaml` for local Jaeger/OTEL collector.
+- **`test_telemetry.py`**: 20 unit tests for telemetry module (no Docker required) — includes startup-wiring and decorator-coverage regression tests.
+- **`.env.example`**: template for telemetry environment variables with privacy guidance.
+- **`test_stdout_logger_silence.py`**: integration tests verifying Polyhedron/Space Logger warnings are fully suppressed after `silence_openstudio_stdout_logger()`.
+
+### Fixed
+- **ECM package example**: window ECM was incorrectly using `create_standard_opaque_material`; now correctly notes that glazing requires `SimpleGlazing` authored via `create_measure`.
+- **README tracing Docker example**: corrected image tag from `openstudio-mcp:dev` to `openstudio-mcp:tracing` (the dev image does not include traceloop-sdk); added build command and explanatory note.
+- **`TRACELOOP_TRACE_CONTENT` docs**: expanded to warn that the default (`true`) exports tool arguments and outputs to the OTLP backend; recommends `false` as the safe starting point.
+- **`opentelemetry-sdk` version constraint**: tightened `[dev]` lower bound from `>=1.20` to `>=1.38.0` to match `traceloop-sdk`'s actual minimum; prevents pip resolving an incompatible version when both extras are installed.
+
 ## [0.9.0] - 2026-04-10
 
 ### Added
diff --git a/README.md b/README.md
index 05e406c..ad48b70 100644
--- a/README.md
+++ b/README.md
@@ -106,19 +106,21 @@ For simulation outputs (results, SQL, HTML reports), these are already in `/runs
 
 ### Other MCP Hosts
 
-[VS Code Copilot](https://code.visualstudio.com/), [Claude Code](https://docs.anthropic.com/en/docs/claude-code), [Windsurf](https://windsurf.com/), and [Gemini CLI](https://github.com/google-gemini/gemini-cli) also support MCP with similar JSON config. See the [MCP documentation](https://modelcontextprotocol.io/quickstart/user) for host-specific setup.
+See **[`docs/clients/`](docs/clients/index.md)** for per-client setup guides with config files, tool limits, and performance notes.
 
 ### Client Compatibility
 
-| Client | Status | Notes |
-|--------|--------|-------|
-| Claude Desktop | Full support | All 142 tools available |
-| Claude Code | Full support | ToolSearch auto-defers tools for efficient discovery |
-| VS Code Copilot | Compatible | MCP support via config |
-| Windsurf | Compatible | Under 100-tool limit |
-| Gemini CLI | Compatible | Use includeTools/excludeTools if needed |
-| Cursor | Not compatible | 40-tool hard cap — use Windsurf or Claude Code instead |
-| OpenAI API | Compatible | Use defer_loading for best results |
+| Client | Tool Limit | Status | Guide |
+|--------|-----------|--------|-------|
+| Claude Code | Unlimited (ToolSearch) | ✅ Best | [claude-code.md](docs/clients/claude-code.md) |
+| Claude Desktop | ~100 practical | ✅ Full | [claude-desktop.md](docs/clients/claude-desktop.md) |
+| VS Code Copilot | 128 hard | ✅ Full | [vs-code-copilot.md](docs/clients/vs-code-copilot.md) |
+| Windsurf | 100 hard | ⚠️ Partial | [windsurf.md](docs/clients/windsurf.md) — manual tool selection required |
+| Gemini CLI | 100 soft / 512 API | ⚠️ Partial | [gemini-cli.md](docs/clients/gemini-cli.md) — use `includeTools` |
+| Cursor | 40 hard | ❌ Incompatible | [cursor.md](docs/clients/cursor.md) — 40-tool cap |
+| OpenAI API | 128 (recommends ~10) | ✅ Compatible | Use `defer_loading` for best results |
+
+See [token context & performance](docs/clients/token-context-performance.md) for a breakdown of how each client handles the 142-tool surface.
 
 ---
 
@@ -532,6 +534,40 @@ In **prod mode**, stdout is reserved exclusively for MCP JSON-RPC messages. Logs
 
 ---
 
+## Tracing (OpenLLMetry)
+
+Distributed tracing via [traceloop-sdk](https://github.com/traceloop/openllmetry) is available as an optional extra. Install it, then set `TRACELOOP_BASE_URL` to enable it:
+
+```bash
+pip install 'openstudio-mcp[telemetry]'
+```
+
+Or with Docker (requires the tracing image built with `--build-arg TELEMETRY=1`):
+
+```bash
+# Build the tracing-enabled image once:
+docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile .
+
+# Run with tracing enabled:
+docker run --rm -i \
+  -e TRACELOOP_BASE_URL=http://host.docker.internal:4318 \
+  openstudio-mcp:tracing openstudio-mcp
+```
+
+The standard `openstudio-mcp:dev` image does **not** include `traceloop-sdk`. If you set `TRACELOOP_BASE_URL` while using that image, the server logs a warning and disables tracing, so the misconfiguration is reported rather than failing silently.
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `TRACELOOP_BASE_URL` | *(unset — disabled)* | OTLP endpoint, e.g. `http://localhost:4318` or `https://api.traceloop.com` |
+| `TRACELOOP_API_KEY` | *(unset)* | API key for Traceloop cloud (not needed for generic OTLP) |
+| `OTEL_SERVICE_NAME` | `openstudio-mcp` | Service name on every span |
+| `OTEL_EXPORT_BATCH` | `true` | Set `false` for synchronous export in development |
+| `TRACELOOP_TRACE_CONTENT` | `true` | **Set `false` to protect privacy** — when `true`, tool arguments and outputs (including file paths and model data) are exported to the OTLP backend. Recommended: start with `false` and enable only if your backend is self-hosted or you have reviewed the data. |
+
+Tracing is **off by default** and has zero overhead when `TRACELOOP_BASE_URL` is unset. Key operations (`run_simulation`, `apply_measure`, `create_measure`, the three `create_*_building` variants, and `run_qaqc_checks`) emit named spans. Every FastMCP tool call is auto-instrumented via `McpInstrumentor`.
+
+---
+
 ## Architecture
 
 - **Transport:** stdio (container spawned by host)
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000..3c95ef7
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,108 @@
+# Security Policy
+
+## Scope
+
+This document covers the `openstudio-mcp` MCP server — a container-bound process that gives
+AI agents programmatic control of building energy models via the OpenStudio SDK.
+
+---
+
+## Path Safety
+
+### Allowed Roots
+
+All file operations (`read_file`, `copy_file`, `run_osw`, `validate_osw`, etc.) are restricted
+to a fixed set of allowed path roots enforced by `is_path_allowed()` in `mcp_server/config.py`:
+
+| Root (default path) | Purpose | Env Override |
+|---|---|---|
+| `/runs` | Run outputs, simulation artifacts | `OPENSTUDIO_MCP_RUN_ROOT` |
+| `/inputs` | User-provided models and weather files | `OPENSTUDIO_MCP_INPUT_ROOT` |
+| `/repo` | Server source code (read-only use cases) | — |
+| Bundled measures dirs | ComStock and common measures | — |
+| Skills dir | Skill Markdown guides | — |
+
+Any path that resolves (after symlink expansion) outside these roots is rejected with
+`{"ok": false, "error": "invalid_path"}`. Symlink traversal is prevented by calling
+`Path.resolve()` before comparison.
+
+### Path Traversal Mitigations
+
+| Attack Vector | Mitigation |
+|---|---|
+| `../../etc/passwd` in `file_path` | `Path.resolve()` + allowlist check |
+| `../../etc` in `copy_file` `destination` | Same: both source and destination validated |
+| `../model.osm` in `seed_file` (OSW) | Flattened to `basename` before staging into run dir |
+| Symlink escape from `/runs` | `resolve()` follows symlinks before allowlist check |
+
+### What Is Not Protected
+
+- **Denial of service** via large file reads: `read_file` defaults to 50 KB (`max_bytes=50_000`)
+  but callers can override `max_bytes`. No upper-bound cap is enforced — consider adding one
+  if exposing this server to untrusted clients.
+- **EnergyPlus subprocess**: The simulation runner (`run_simulation`, `run_osw`) invokes
+  `openstudio run` as a subprocess. The OSM/OSW content is caller-controlled; a malicious model
+  could cause unexpected EnergyPlus behavior. The container boundary is the primary mitigation.
+
+---
+
+## Container Isolation
+
+The server is designed to run inside a Docker container with explicit volume mounts:
+
+```
+docker run --rm -i \
+  -v "/path/to/models:/inputs" \
+  -v "/path/to/outputs:/runs" \
+  openstudio-mcp:dev openstudio-mcp
+```
+
+- The host filesystem is **not mounted** except for the two explicit volumes.
+- By default, the server performs no outbound network calls; OpenStudio/EnergyPlus
+  are fully offline. **Exception:** when `TRACELOOP_BASE_URL` is set, the server
+  exports traces to that OTLP endpoint.
+- The server process runs as the user defined by the container runtime. The repo
+  Dockerfile does not set a `USER` instruction — it runs as root by default.
+  Production deployments should add a non-root `USER` in a derived image.
+
+---
+
+## Stdout / MCP Transport Integrity
+
+OpenStudio's SWIG bindings emit log warnings to C stdout. This would corrupt the JSON-RPC
+transport (MCP communicates over stdio). Two mitigations are applied at startup in `server.py`:
+
+1. `silence_openstudio_stdout_logger()` — sets OpenStudio's standard-out logger to `Fatal`
+   level, suppressing operational warnings.
+2. `redirect_c_stdout_to_stderr()` — permanently redirects C-level stdout (fd 1) to stderr,
+   with Python's `sys.stdout` on a private pipe to the MCP client. This is a backstop for
+   any C-extension output that bypasses the logger.
+
+These mitigations prevent log injection into the MCP JSON-RPC stream.
+
+---
+
+## Reporting a Vulnerability
+
+Please **do not** open a public GitHub issue for security vulnerabilities.
+
+Email the maintainers directly or use GitHub's
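+[private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature (this repository's **Security** tab → **Report a vulnerability**). Include: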
+
+- A description of the vulnerability and its impact
+- Steps to reproduce (minimal repro preferred)
+- Affected versions or commits
+
+We aim to acknowledge reports within 72 hours and provide a fix or mitigation within 14 days
+for confirmed issues.
+
+---
+
+## Known Limitations / Out of Scope
+
+- **Authentication / authorization**: The MCP server has no built-in auth. Access control is
+  the responsibility of the MCP client and the host environment.
+- **EnergyPlus model content**: The server executes whatever EnergyPlus model the caller
+  provides. Malicious model content is an EnergyPlus concern, not an MCP server concern.
+- **Multi-tenancy**: The server holds a single shared in-memory model. It is not designed for
+  simultaneous untrusted multi-user access.
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 1a4ae9d..3e600f1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -43,8 +43,13 @@ COPY pyproject.toml /repo/pyproject.toml
 COPY mcp_server /repo/mcp_server
 COPY docker /repo/docker
 
+# TELEMETRY=1 installs traceloop-sdk + opentelemetry instrumentation for MCP.
+# Default off to keep the image lean. Set to 1 for the tracing variant:
+#   docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile .
+ARG TELEMETRY=0
 RUN pip install --no-cache-dir -U pip \
-    && pip install --no-cache-dir -e ".[dev]"
+    && pip install --no-cache-dir -e ".[dev]" \
+    && if [ "$TELEMETRY" = "1" ]; then pip install --no-cache-dir -e ".[telemetry]"; fi
 
 # (Optional) If you want the container to include any other repo files too:
 # COPY . /repo
diff --git a/docker/docker-compose.tracing.yml b/docker/docker-compose.tracing.yml
new file mode 100644
index 0000000..d75f7e2
--- /dev/null
+++ b/docker/docker-compose.tracing.yml
@@ -0,0 +1,91 @@
+# OpenLLMetry / Traceloop tracing stack for openstudio-mcp
+#
+# Quick start:
+#   # 1. Build the tracing-enabled MCP image (adds traceloop-sdk):
+#   docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile .
+#
+#   # 2. Start Jaeger:
+#   docker compose -f docker/docker-compose.tracing.yml up -d
+#   open http://localhost:16686   # Jaeger UI — traces appear here
+#
+#   # 3. Configure your MCP client to use the tracing image on the shared network.
+#      Add these flags to your client's docker run command:
+#
+#   -e TRACELOOP_BASE_URL=http://jaeger:4318
+#   --network openstudio-mcp-tracing
+#   (and use openstudio-mcp:tracing instead of openstudio-mcp:dev)
+#
+# Example — Claude Code .mcp.json with tracing enabled:
+#
+#   {
+#     "mcpServers": {
+#       "openstudio-mcp": {
+#         "command": "docker",
+#         "args": [
+#           "run", "--rm", "-i",
+#           "-v", "/abs/path/inputs:/inputs",
+#           "-v", "/abs/path/runs:/runs",
+#           "--network", "openstudio-mcp-tracing",
+#           "-e", "OPENSTUDIO_MCP_MODE=prod",
+#           "-e", "TRACELOOP_BASE_URL=http://jaeger:4318",
+#           "-e", "OTEL_SERVICE_NAME=openstudio-mcp",
+#           "-e", "TRACELOOP_TRACE_CONTENT=true",
+#           "openstudio-mcp:tracing", "openstudio-mcp"
+#         ]
+#       }
+#     }
+#   }
+#
+# Environment variables understood by the MCP server (see mcp_server/telemetry.py):
+#   TRACELOOP_BASE_URL      OTLP HTTP endpoint (required to enable telemetry)
+#   TRACELOOP_API_KEY       API key for Traceloop cloud (omit for local Jaeger)
+#   OTEL_SERVICE_NAME       Service name on spans (default: openstudio-mcp)
+#   OTEL_EXPORT_BATCH       "false" for sync export in dev (default: batch)
+#   TRACELOOP_TRACE_CONTENT "false" to omit tool args from spans (privacy)
+
+services:
+  jaeger:
+    image: jaegertracing/jaeger:2.5.0
+    container_name: openstudio-mcp-jaeger
+    ports:
+      - "16686:16686"   # Jaeger UI
+      - "4317:4317"     # OTLP gRPC receiver
+      - "4318:4318"     # OTLP HTTP receiver (used by traceloop-sdk)
+    environment:
+      - SPAN_STORAGE_TYPE=memory
+    networks:
+      - tracing
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "wget", "--spider", "-q", "http://localhost:16686/"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+      start_period: 10s
+
+  # Optional: OpenTelemetry Collector as a middle layer (starts only with `--profile collector`).
+  # Useful if you want to fan out to multiple backends (Jaeger + Prometheus + Loki).
+  # For the simplest setup, omit the profile and point TRACELOOP_BASE_URL directly at http://jaeger:4318.
+  otel-collector:
+    image: otel/opentelemetry-collector-contrib:0.120.0
+    container_name: openstudio-mcp-otelcol
+    command: ["--config=/etc/otel/config.yaml"]
+    volumes:
+      - ./otel-collector-config.yaml:/etc/otel/config.yaml:ro
+    ports:
+      - "4319:4318"   # OTLP HTTP (external port offset to avoid conflict with jaeger)
+      - "4320:4317"   # OTLP gRPC (external port offset)
+      - "8888:8888"   # Collector metrics (Prometheus scrape endpoint)
+    networks:
+      - tracing
+    depends_on:
+      jaeger:
+        condition: service_healthy
+    restart: unless-stopped
+    profiles:
+      - collector   # only start with: docker compose --profile collector up
+
+networks:
+  tracing:
+    name: openstudio-mcp-tracing
+    driver: bridge
diff --git a/docker/otel-collector-config.yaml b/docker/otel-collector-config.yaml
new file mode 100644
index 0000000..4520179
--- /dev/null
+++ b/docker/otel-collector-config.yaml
@@ -0,0 +1,59 @@
+# OpenTelemetry Collector config for openstudio-mcp tracing stack.
+# Used only when running: docker compose --profile collector up
+# For simple setups, skip this and point TRACELOOP_BASE_URL directly to jaeger:4318.
+
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: 0.0.0.0:4317
+      http:
+        endpoint: 0.0.0.0:4318
+
+processors:
+  batch:
+    timeout: 1s
+    send_batch_size: 1024
+  memory_limiter:
+    check_interval: 1s
+    limit_mib: 256
+    spike_limit_mib: 64
+  # Insert a service.namespace resource attribute (only when the key is not already set)
+  resource:
+    attributes:
+      - key: service.namespace
+        value: openstudio-mcp
+        action: insert
+
+exporters:
+  otlp/jaeger:
+    endpoint: jaeger:4317
+    tls:
+      insecure: true
+
+  # Uncomment to export to Traceloop cloud instead of / in addition to Jaeger:
+  # otlphttp/traceloop:
+  #   endpoint: https://api.traceloop.com
+  #   headers:
+  #     Authorization: "Bearer ${TRACELOOP_API_KEY}"
+
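+  # Prometheus exporter: serves metrics received by the metrics pipeline (scrape at :8888/metrics). NOTE(review): 8888 is also the collector's default self-telemetry port — confirm there is no bind conflict.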
+  prometheus:
+    endpoint: 0.0.0.0:8888
+
+  # Debug: print spans to collector stdout (useful for development)
+  debug:
+    verbosity: basic
+    sampling_initial: 5
+    sampling_thereafter: 200
+
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      processors: [memory_limiter, resource, batch]
+      exporters: [otlp/jaeger]
+    metrics:
+      receivers: [otlp]
+      processors: [memory_limiter, batch]
+      exporters: [prometheus]
diff --git a/docs/clients/claude-code.md b/docs/clients/claude-code.md
new file mode 100644
index 0000000..2809f33
--- /dev/null
+++ b/docs/clients/claude-code.md
@@ -0,0 +1,131 @@
+# Claude Code Setup
+
+> **Last verified:** April 2026 · Claude Code 1.x · [Docs](https://docs.anthropic.com/en/docs/claude-code/mcp)
+
+Claude Code is the **optimal client** for openstudio-mcp. Its ToolSearch feature automatically defers all 142 tools and retrieves only the 3-5 most relevant ones per turn — this eliminates context bloat and keeps accuracy high even on long multi-step workflows. No manual tool filtering is required.
+
+---
+
+## Prerequisites
+
+- **Docker Desktop** running ([download](https://www.docker.com/products/docker-desktop/))
+- **Claude Code** installed: `npm install -g @anthropic-ai/claude-code`
+- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .`
+
+---
+
+## Configuration
+
+Create `.mcp.json` in your project root (or any directory you run `claude` from). A template is provided at `.mcp.json.example` in the openstudio-mcp repo. Copy it and fill in your absolute paths:
+
+```bash
+cp .mcp.json.example .mcp.json
+# Edit .mcp.json and replace /ABSOLUTE/PATH/TO/... placeholders
+```
+
+`.mcp.json` contains machine-specific absolute paths so it is gitignored by default. Share the `.mcp.json.example` template with your team instead.
+
+```json
+{
+  "mcpServers": {
+    "openstudio-mcp": {
+      "command": "docker",
+      "args": [
+        "run", "--rm", "-i",
+        "-v", "/absolute/path/to/your/inputs:/inputs",
+        "-v", "/absolute/path/to/your/runs:/runs",
+        "-v", "/absolute/path/to/openstudio-mcp/.claude/skills:/skills:ro",
+        "-e", "OPENSTUDIO_MCP_MODE=prod",
+        "openstudio-mcp:dev", "openstudio-mcp"
+      ]
+    }
+  }
+}
+```
+
+> **Tip:** Include the `.claude/skills` mount. Claude Code's ToolSearch indexes tool descriptions at connection time — the skill guides improve keyword matching for `get_skill()` and `list_skills()` calls.
+
+**Alternative: pass config path explicitly**
+
+```bash
+claude --mcp-config /path/to/mcp.json
+```
+
+---
+
+## Verification
+
+```bash
+# Confirm .mcp.json is valid and openstudio-mcp is registered
+# (run from the project directory containing .mcp.json)
+claude mcp add openstudio-mcp --scope project docker -- run --rm -i \
+  -v "/absolute/path/to/inputs:/inputs" \
+  -v "/absolute/path/to/runs:/runs" \
+  -e OPENSTUDIO_MCP_MODE=prod openstudio-mcp:dev openstudio-mcp
+# → prints "MCP server openstudio-mcp already exists in .mcp.json" if it's registered
+```
+
+> **Note:** `claude mcp list` shows only user-scope servers. Project-scope `.mcp.json` servers load when you start an interactive `claude` session from that directory — they won't appear in `mcp list`.
+
+Start a session and test:
+
+```bash
+# Start Claude Code in the project directory
+cd /path/to/your/project
+claude
+
+# At the prompt:
+> list_skills
+```
+
+A successful response shows available skill categories. Claude Code will use ToolSearch to find and load only the relevant tool schemas for each request.
+
+---
+
+## How ToolSearch Works
+
+When the total tool schema size exceeds 10% of the model's context window, Claude Code automatically defers all tools and exposes only a `mcp__openstudio-mcp__search_tools` search endpoint. The workflow becomes:
+
+1. Your prompt arrives
+2. Claude Code searches for relevant tools by keyword
+3. 3–5 matching tool schemas are loaded into context
+4. The tool is called with correct parameters
+5. Repeat as needed
+
+This means openstudio-mcp's 142 tools behave as if there were only 5 at any given moment. **ToolSearch indexes the tools exposed by the running image when Claude Code connects** — tool definitions are baked into the Docker image, so always rebuild it after adding new tools (`docker build`).
+
+### Why ToolSearch Accuracy Depends on Descriptions
+
+ToolSearch uses BM25/regex matching on tool names and descriptions. Vague prompts ("add HVAC") depend on description keywords to route correctly. The skills system supplements this — calling `list_skills()` and `get_skill("add-hvac")` gives Claude a step-by-step guide that bypasses tool ambiguity entirely.
+
+---
+
+## First Prompts
+
+```
+Simple:   "Create an example model and describe its thermal zones"
+Medium:   "Follow the new-building skill to create a 5-story office in Boston"
+Advanced: "Load /inputs/baseline.osm, run a simulation, and show me the EUI breakdown by end use"
+```
+
+---
+
+## Context & Performance Notes
+
+ToolSearch reduces per-turn schema overhead from ~15K tokens (all 142 loaded) to ~1K tokens (3-5 tools loaded). This is the primary reason Claude Code is the recommended client — see [Token Context & Performance](./token-context-performance.md) for numbers.
+
+Observed benchmark (3-model sweep, 180 tests, zero retries):
+- Sonnet: 94.4% pass rate, avg 1.9 ToolSearch calls/test
+- Opus: 94.4% pass rate, avg 2.0 ToolSearch calls/test
+- Haiku: 88.9% pass rate (does not use ToolSearch; reasons directly from tool list)
+
+---
+
+## Troubleshooting
+
+| Symptom | Likely cause | Fix |
+|---------|-------------|-----|
+| ToolSearch returns "No matching tools found" | Image not rebuilt after tool additions | `docker build -t openstudio-mcp:dev -f docker/Dockerfile .` |
+| Tools work in one session, not another | `.mcp.json` not in working directory | Check `pwd` matches where `.mcp.json` lives |
+| Claude generates Python scripts instead of using tools | ToolSearch not finding MCP tools | Rebuild image; check descriptions include relevant keywords |
+| Long workflows lose model state | In-memory model cleared between `claude` sessions | Save model to `/runs/` at end of each session with `save_osm_model` |
diff --git a/docs/clients/claude-desktop.md b/docs/clients/claude-desktop.md
new file mode 100644
index 0000000..de104f0
--- /dev/null
+++ b/docs/clients/claude-desktop.md
@@ -0,0 +1,108 @@
+# Claude Desktop Setup
+
+> **Last verified:** April 2026 · Claude Desktop 0.10+ · [Docs](https://support.anthropic.com/en/articles/9517730-getting-started-with-claude-desktop)
+
+Claude Desktop is the recommended starting point for openstudio-mcp. It has a GUI, supports all 142 tools, and handles the full skill workflow. The main limitation is that all tool schemas load into context upfront — above ~100 tools, you may notice the model spending more tokens on tool routing before the first useful response.
+
+---
+
+## Prerequisites
+
+- **Docker Desktop** running ([download](https://www.docker.com/products/docker-desktop/))
+- **Claude Desktop** installed ([download](https://claude.ai/download))
+- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .`
+
+---
+
+## Configuration
+
+Open the Claude Desktop config file:
+
+| OS | Path |
+|----|------|
+| macOS | `~/Library/Application Support/Claude/claude_desktop_config.json` |
+| Windows | `%APPDATA%\Claude\claude_desktop_config.json` |
+
+Add the `openstudio-mcp` entry to the `mcpServers` block. Replace the placeholder paths with **absolute paths** on your machine:
+
+```json
+{
+  "mcpServers": {
+    "openstudio-mcp": {
+      "command": "docker",
+      "args": [
+        "run", "--rm", "-i",
+        "-v", "/absolute/path/to/your/inputs:/inputs",
+        "-v", "/absolute/path/to/your/runs:/runs",
+        "-e", "OPENSTUDIO_MCP_MODE=prod",
+        "openstudio-mcp:dev", "openstudio-mcp"
+      ]
+    }
+  }
+}
+```
+
+**Optional: include skill guides** (enables `list_skills()` / `get_skill()` tools):
+
+```json
+"-v", "/absolute/path/to/openstudio-mcp/.claude/skills:/skills:ro",
+```
+
+---
+
+## Verification
+
+1. **Restart Claude Desktop** after saving the config
+2. Look for the **hammer icon (🔨)** in the chat input bar — it appears when at least one MCP server is connected
+3. Click the hammer icon to see all available tools listed under "openstudio-mcp"
+4. Send this test prompt:
+
+   > *"Call list_skills and tell me what skill categories are available."*
+
+   A successful response lists the skill categories (geometry, HVAC, simulation, etc.). If you see a generic response or an error, check the troubleshooting section below.
+
+---
+
+## First Prompts
+
+Try these in order of complexity:
+
+```
+Simple:   "Create an example model and tell me about it"
+Medium:   "Create a small office building with ASHRAE System 3 and show me the HVAC components"
+Advanced: "Load my model at /inputs/MyBuilding.osm, apply the 90.1-2019 template, and run a simulation"
+```
+
+---
+
+## File Access Pattern
+
+Place your `.osm` models and weather files in the folder you mapped to `/inputs`. Claude Desktop's built-in file upload puts files into an Analysis sandbox that cannot reach MCP tools — always use the `/inputs` mount instead.
+
+```bash
+# Copy your model to the inputs folder before referencing it in chat
+cp MyBuilding.osm /absolute/path/to/your/inputs/
+
+# Then reference it by its container path in your chat prompt (this is not a shell command):
+#   "Analyze the building at /inputs/MyBuilding.osm"
+```
+
+---
+
+## Context & Performance Notes
+
+Claude Desktop loads all 142 tool schemas into context on the first tool call. This costs approximately **15K tokens** of your context budget upfront — see [Token Context & Performance](./token-context-performance.md) for a full breakdown.
+
+Practical effect: initial responses in a new conversation may include brief tool-selection overhead. Long conversations (15+ turns with heavy tool use) may exhaust context on complex models. If this happens, start a fresh conversation and reference `/runs/` outputs by path.
+
+---
+
+## Troubleshooting
+
+| Symptom | Likely cause | Fix |
+|---------|-------------|-----|
+| No hammer icon | Docker not running, or config JSON is invalid | Validate JSON at jsonlint.com; check `docker ps` |
+| Hammer icon but no openstudio-mcp tools | Image not found | Run `docker images` to confirm `openstudio-mcp:dev` exists |
+| `Error: volume path is not absolute` | Relative paths in config | Replace `./runs` with the full absolute path |
+| Model loaded but changes lost | `/runs` not mounted | Confirm the `-v` run path mount is in your config |
+| Upload file, tools not used | File went to Analysis sandbox | Move file to `/inputs` folder and reference by path instead |
diff --git a/docs/clients/cursor.md b/docs/clients/cursor.md
new file mode 100644
index 0000000..cfdb437
--- /dev/null
+++ b/docs/clients/cursor.md
@@ -0,0 +1,31 @@
+# Cursor — Not Compatible
+
+Cursor has a **40-tool hard cap** for MCP servers. openstudio-mcp provides 142 tools. Cursor will silently truncate the tool list to the first 40 returned by `tools/list`, which means the majority of the BEM workflow — including HVAC configuration, geometry editing, results extraction, and measure authoring — will be inaccessible.
+
+There is no supported workaround within Cursor itself (tool filtering is not user-configurable at the MCP level in current versions).
+
+---
+
+## What Happens If You Try
+
+Adding openstudio-mcp to `.cursor/mcp.json` will technically connect the server, but Cursor will load only the first 40 tools returned by `tools/list`. Prompts that happen to use those 40 tools will work; anything requiring a tool beyond position 40 will fail silently (the model will either hallucinate a response or say the operation isn't possible).
+
+---
+
+## Recommended Alternatives
+
+| Client | Why it's better for this use case |
+|--------|----------------------------------|
+| **Claude Code** | Best option: ToolSearch handles 142 tools with auto-deferral |
+| **Windsurf** | 100-tool limit; workable with manual tool selection |
+| **VS Code Copilot** | 128-tool limit; close to full coverage with minor tool disabling |
+| **Gemini CLI** | 100 soft limit; `includeTools` filter makes it manageable |
+| **Claude Desktop** | Full 142 tools; good for interactive exploration |
+
+---
+
+## If You Must Use Cursor
+
+If your workflow is confined to a narrow subset of tools (e.g., only model inspection and simulation result reading), you can curate a 40-tool subset by running a local wrapper that filters the tool list before serving it to Cursor. This is an advanced workaround and not officially supported.
+
+Track Cursor's MCP roadmap for changes to the tool cap: [Cursor MCP docs](https://docs.cursor.com/context/model-context-protocol).
diff --git a/docs/clients/gemini-cli.md b/docs/clients/gemini-cli.md
new file mode 100644
index 0000000..d368596
--- /dev/null
+++ b/docs/clients/gemini-cli.md
@@ -0,0 +1,124 @@
+# Gemini CLI Setup
+
+> **Last verified:** April 2026 · Gemini CLI 0.1.x · [Docs](https://github.com/google-gemini/gemini-cli)
+
+Gemini CLI is a terminal-based AI agent with a **1M token context window** — the largest of any supported client. This makes it well-suited for long BEM workflows and large file analysis. The soft 100-tool limit (512 via API) requires using `includeTools` to avoid degraded performance when all 142 tools are registered.
+
+---
+
+## Prerequisites
+
+- **Docker Desktop** running
+- **Gemini CLI** installed:
+  ```bash
+  npm install -g @google/gemini-cli
+  ```
+  > **Note:** The Homebrew formula (`brew install gemini-cli`) has a known dependency issue with `@google/gemini-cli-core`. Use npm.
+- **Google account** (free tier: 60 req/min, 1,000 req/day) or Gemini API key
+- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .`
+
+---
+
+## Configuration
+
+Add openstudio-mcp to `~/.gemini/settings.json`. Create the file if it doesn't exist:
+
+```json
+{
+  "mcpServers": {
+    "openstudio-mcp": {
+      "command": "docker",
+      "args": [
+        "run", "--rm", "-i",
+        "-v", "/absolute/path/to/your/inputs:/inputs",
+        "-v", "/absolute/path/to/your/runs:/runs",
+        "-e", "OPENSTUDIO_MCP_MODE=prod",
+        "openstudio-mcp:dev", "openstudio-mcp"
+      ]
+    }
+  }
+}
+```
+
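+**Alternative: project-scoped config** via `.gemini/settings.json` in your project directory, using the same `mcpServers` format as above. Gemini CLI also reads `GEMINI.md` as project context on startup, but that file holds instructions for the model (for example, which tools or skills to prefer) — it is not where server configuration goes.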
+
+---
+
+## Managing the Tool Limit
+
+Gemini CLI has a soft limit of 100 tools in interactive mode (512 via API). With 142 tools registered, performance may degrade. Use the `includeTools` filter to expose only the tools you need:
+
+```json
+{
+  "mcpServers": {
+    "openstudio-mcp": {
+      "command": "docker",
+      "args": [
+        "run", "--rm", "-i",
+        "-v", "/absolute/path/to/inputs:/inputs",
+        "-v", "/absolute/path/to/runs:/runs",
+        "-e", "OPENSTUDIO_MCP_MODE=prod",
+        "openstudio-mcp:dev", "openstudio-mcp"
+      ],
+      "includeTools": [
+        "list_skills", "get_skill",
+        "create_new_building", "create_bar_building", "create_typical_building",
+        "load_osm_model", "save_osm_model", "get_model_summary", "get_building_info",
+        "list_thermal_zones", "list_spaces", "list_air_loops", "list_plant_loops",
+        "add_baseline_system", "list_baseline_systems",
+        "run_simulation", "get_run_status",
+        "extract_summary_metrics", "extract_end_use_breakdown", "compare_runs",
+        "validate_model", "change_building_location",
+        "list_surfaces", "replace_window_constructions",
+        "create_measure", "test_measure", "apply_measure",
+        "generate_results_report", "recommend_tools"
+      ]
+    }
+  }
+}
+```
+
+Extend the `includeTools` list as needed. See the [Windsurf guide](./windsurf.md) for a recommended tool set organized by workflow, and the [client index](./index.md) for the canonical server config.
+
+---
+
+## Verification
+
+```bash
+# Confirm openstudio-mcp is registered (no API call needed)
+gemini mcp list
+# → Should show "✓ openstudio-mcp: docker run ... (stdio) - Connected"
+
+# Start Gemini CLI and test interactively
+gemini
+> Use openstudio-mcp to list the available skills
+```
+
+---
+
+## First Prompts
+
+```
+"Create a medium office building in Chicago using openstudio-mcp and run a simulation"
+"Load the model at /inputs/baseline.osm and compare the envelope constructions"
+"Write a Ruby measure to set all lights to 8 W/m2, test it, and apply it"
+```
+
+---
+
+## Context & Performance Notes
+
+Gemini 2.0/2.5 models have a 1M token context window, so tool schema overhead (~29K tokens for all 142, about 2.9% of the window) is a small fraction of total capacity. Long BEM workflows with many intermediate results are well-suited to Gemini CLI's large context.
+
+However, accuracy is a function of tool count presented per turn, not total context size. Using `includeTools` to present 30–40 tools at a time keeps the model focused. See [Token Context & Performance](./token-context-performance.md).
+
+---
+
+## Troubleshooting
+
+| Symptom | Likely cause | Fix |
+|---------|-------------|-----|
+| Tools not found | Config file location wrong | Confirm `~/.gemini/settings.json` (not `.gemini/config.json`) |
+| Tool count exceeds limit warning | >100 tools registered | Add `includeTools` filter to config |
+| Slow first response | All 142 schemas loading | Add `includeTools` to reduce initial schema payload |
+| Free tier rate limit hit | >60 req/min | Upgrade to Gemini API key or reduce tool calls per workflow |
diff --git a/docs/clients/index.md b/docs/clients/index.md
new file mode 100644
index 0000000..0a31fe1
--- /dev/null
+++ b/docs/clients/index.md
@@ -0,0 +1,66 @@
+# MCP Client Setup Guide
+
+This section covers how to connect openstudio-mcp to each supported AI client, what to expect from the 142-tool surface in each environment, and how to evaluate the performance impact on your context window.
+
+---
+
+## Client Compatibility
+
+| Client | Tool Limit | Discovery | Status | Notes |
+|--------|-----------|-----------|--------|-------|
+| **Claude Code** | Unlimited | ToolSearch (auto-defer) | ✅ Best | Defers all 142 tools; retrieves 3-5 per turn by keyword |
+| **Claude Desktop** | ~100 practical | None (all in context) | ✅ Full | All tools load upfront; degradation above ~100 tools |
+| **VS Code Copilot** | 128 hard | None | ✅ Full | Requires VS Code 1.99+ with MCP support enabled |
+| **Windsurf** | 100 hard | Per-tool toggle | ⚠️ Partial | Must disable 42+ tools via UI; not plug-and-play |
+| **Gemini CLI** | 100 soft / 512 API | includeTools/excludeTools | ⚠️ Partial | Use `includeTools` to scope to a working subset |
+| **Cursor** | 40 hard | None | ❌ Incompatible | 40-tool hard cap; use Windsurf or Claude Code instead |
+
+**Recommendation:** Claude Code is the optimal client for openstudio-mcp. It is the only client with dynamic tool discovery that handles 142 tools efficiently and without manual configuration.
+
+---
+
+## Canonical Docker Server Config
+
+Every client needs a block that tells it how to launch the server. The **core Docker command** is the same in all cases — only the key names differ by client.
+
+```json
+{
+  "command": "docker",
+  "args": [
+    "run", "--rm", "-i",
+    "-v", "/ABSOLUTE/PATH/TO/inputs:/inputs",
+    "-v", "/ABSOLUTE/PATH/TO/runs:/runs",
+    "-e", "OPENSTUDIO_MCP_MODE=prod",
+    "openstudio-mcp:dev", "openstudio-mcp"
+  ]
+}
+```
+
+**Required substitutions:**
+- `/ABSOLUTE/PATH/TO/inputs` — folder containing your `.osm` and weather files
+- `/ABSOLUTE/PATH/TO/runs` — folder where simulation outputs will be written
+
+> **Use absolute paths.** Many clients run the command from an unpredictable working directory, so relative paths like `./runs` will silently fail or point to the wrong location.
+
+**Optional: mount skill guides for `get_skill()` / `list_skills()` access**
+
+```json
+"-v", "/ABSOLUTE/PATH/TO/openstudio-mcp/.claude/skills:/skills:ro",
+```
+
+See each client guide for how to embed this block in the client's specific config format.
+
+---
+
+## Guide Index
+
+- [Claude Desktop](./claude-desktop.md) — Recommended starting point; GUI client with full tool support
+- [Claude Code](./claude-code.md) — Best for power users; ToolSearch handles 142 tools efficiently
+- [VS Code Copilot](./vs-code-copilot.md) — VS Code 1.99+; 128-tool limit, workspace-scoped config
+- [Windsurf](./windsurf.md) — Cascade AI; 100-tool limit requires manual tool selection
+- [Gemini CLI](./gemini-cli.md) — Terminal-based; 1M token context; use `includeTools` to subset
+- [Cursor](./cursor.md) — Not compatible; 40-tool hard cap; alternatives listed
+
+## Reference
+
+- [Token Context & Performance Impact](./token-context-performance.md) — How the 142-tool surface affects each client's context budget
diff --git a/docs/clients/token-context-performance.md b/docs/clients/token-context-performance.md
new file mode 100644
index 0000000..a652f1f
--- /dev/null
+++ b/docs/clients/token-context-performance.md
@@ -0,0 +1,324 @@
+# Token Context & Performance Impact
+
+This document covers the measurable cost of connecting openstudio-mcp to an LLM client: how many tokens the 142 tools consume, how different clients handle that load, and what strategies reduce the overhead.
+
+Schema measurements are live — extracted directly from the running MCP server via `tools/list` JSON-RPC (see [Measuring Schema Size](#measuring-schema-size) below). LLM accuracy figures are from the three-model benchmark sweep (180 tests, zero retries; see [`docs/knowledge/tool-discovery-and-llm-testing.md`](../knowledge/tool-discovery-and-llm-testing.md)).
+
+---
+
+## What Adds to Context
+
+When an MCP client connects to openstudio-mcp, the following items may enter the model's context window:
+
+| Item | Size | When loaded |
+|------|------|-------------|
+| Tool schemas (all 142) — full JSON | ~117K chars / **~29K tokens** | On first tool call or session start |
+| Server instructions (`NEVER`/`ALWAYS` rules) | ~550 tokens | Once per session |
+| Skill guide content (`get_skill()` output) | 1–4K tokens per guide | When explicitly requested |
+| MCP prompts / resources | ~0.5K tokens each | When explicitly invoked |
+| Conversation history | Grows per turn | Accumulates throughout session |
+
+**Total fixed overhead on first tool call: ~30–32K tokens.**
+
+For comparison, a full simulation run (create building → simulate → extract results → compare) takes approximately **15K total tokens** in conversation — about half the schema overhead alone.
+
+> **Note on schema token counting:** Early measurements reported ~61K chars / ~15K tokens. That figure counted **names + descriptions only** and omitted the JSON input schemas (parameter names, types, enums, defaults). The full JSON payload an LLM actually receives is 117K chars / ~29K tokens. Both figures are accurate for their stated scope; use the full-JSON number for context budget planning.
+
+---
+
+## Schema Size History
+
+The schema size has been measured at multiple points in the project:
+
+| Date | Tools | Schema Chars (full JSON) | Est. Tokens | Change |
+|------|-------|--------------------------|-------------|--------|
+| Feb 2026 | 62 | ~54K | ~13.5K | Initial |
+| Mar 2026 | 126 | ~175K | ~44K | +64 tools |
+| Mar 2026 (post-compress) | 127 | ~108K | ~27K | 30% description compression |
+| Apr 2026 | 142 | **117K** | **~29K** | +15 tools; measured live |
+
+Breakdown of the Apr 2026 117K chars:
+- Names: ~2.7K chars (~673 tokens)
+- Descriptions: ~58.7K chars (~14.7K tokens)
+- Input schemas (params/types/enums): ~36.1K chars (~9K tokens)
+- JSON structure overhead: ~19.5K chars (~4.9K tokens)
+
+Key lesson: description compression reduced schema size but harmed ToolSearch accuracy (compressed descriptions had fewer keywords for BM25 matching). The current schema is a deliberate balance between size and discoverability.
+
+---
+
+## Per-Client Context Budget
+
+### Context windows
+
+Measured April 2026 from live `tools/list` response (117,047 chars / 142 tools):
+
+| Client / Model | Context Window | Schema Tokens | Overhead % | Notes |
+|----------------|---------------|---------------|-----------|-------|
+| Claude Code (Sonnet 4.7) | 200K tokens | **~1K tokens*** | **~0.5%*** | ToolSearch defers all; 3–5 tools/turn |
+| Claude Desktop (Sonnet 4.7) | 200K tokens | ~29K tokens | ~14.6% | All 142 schemas in context |
+| VS Code Copilot (GPT-4.1) | 128K tokens | ~28K tokens† | ~22.0%† | 128-tool cap enforced |
+| VS Code Copilot (Claude Sonnet 4.7) | 200K tokens | ~28K tokens† | ~14.1% | 128-tool cap enforced |
+| VS Code Copilot (Gemini 2.5 Flash) | 1M tokens | ~28K tokens† | ~2.8% | 128-tool cap enforced |
+| Windsurf / 80-tool curated | 200K tokens | ~16.5K tokens | ~8.2% | Manual curation required |
+| Gemini CLI (Gemini 2.5 Pro) | 1M tokens | ~29K tokens | ~2.9% | Use `includeTools` to reduce |
+
+\* Claude Code ToolSearch defers all tools; only 3–5 schemas (~615–1,030 tokens at ~205 tokens/tool avg) load per turn.  
+† VS Code Copilot enforces a 128-tool cap; 14 smallest tools excluded, saving ~1.1K tokens. The 14 excluded tools (based on schema size) are: `get_run_period`, `get_versions`, `get_server_status`, `get_weather_info`, `match_surfaces`, `get_simulation_control`, `cancel_run`, `enable_ideal_air_loads`, `set_lifecycle_cost_params`, `extract_hvac_sizing`, `extract_zone_summary`, `get_run_artifacts`, `extract_envelope_summary`, `get_zone_hvac_details`.
+
+### When Context Pressure Becomes a Problem
+
+Claude Code triggers ToolSearch automatically when schemas exceed 10% of context. For other clients, the model itself must manage context. Signs of context pressure:
+
+- Model begins truncating or paraphrasing earlier in the conversation
+- Tool calls start failing to pass correct parameter values (model "forgets" schema details)
+- Model stops using tools entirely and falls back to explaining what it would do
+- Long simulation chains: after 20+ turns with large intermediate results, accuracy drops
+
+**Practical guideline:** At ~29K tokens of schema overhead, Claude Desktop and VS Code Copilot (GPT-4.1 on 128K context) already spend ~15–22% of their budget before any conversation. Plan for 10–15 high-quality turns on complex workflows. Start a new conversation and reference `/runs/` output paths to continue.
+
+---
+
+## How Clients Handle 142 Tools
+
+### Claude Code: ToolSearch (Deferred Loading)
+
+ToolSearch indexes all 142 tools at image build time using BM25/regex on names and descriptions. When schemas exceed 10% of context, tools are deferred. Per turn:
+- ~3–5 tool schemas load into context (~1K tokens, ~97% reduction vs. 29K)
+- Schema overhead per turn: ~1,030 tokens (5 tools × ~205 tokens/tool avg)
+- Works because the ToolSearch index holds the full schema catalog outside context
+
+**Benchmark result:** 94.4% pass rate (Sonnet/Opus, 180 tests, zero retries). ToolSearch calls: avg 1.9/test.
+
+### Claude Desktop / VS Code Copilot: Brute-Force Load
+
+All enabled tool schemas load into context on the first tool call. No deferred loading, no filtering. Performance effect:
+- First response in a new session has ~14–22% context already consumed (vs. ~0.5% for Claude Code)
+- Accuracy stays high for shorter sessions (5–10 turns)
+- Long sessions may show degradation as conversation history + schema + results approach the context limit
+- VS Code Copilot with GPT-4.1 (128K window) is most constrained: ~22% of context consumed before the first user message
+
+### Windsurf: Per-Tool Toggle (Manual Curation)
+
+Cascade enforces 100 tools hard. User selects which tools are enabled. With a curated 80-tool set (~16.5K tokens), the overhead is ~43% lower than loading all 142. Manual curation adds setup friction but produces the most focused tool surface.
+
+### Gemini CLI: Large Context Buffer
+
+1M token context window means schema overhead (~29K tokens = ~2.9%) is low even at full load. The practical concern is accuracy per turn, not context exhaustion — presenting all 142 tools at once can confuse the model. Use `includeTools` to keep per-turn tool count under ~40.
+
+---
+
+## Strategies to Reduce Context Overhead
+
+### 1. Use `list_skills` + `get_skill` First (Universal)
+
+Instead of letting the model search all 142 tools, ask it to follow a skill guide. The guide gives explicit tool names and order, bypassing tool discovery entirely:
+
+```
+"Use the new-building skill to create a medium office building in Boston."
+```
+
+vs.
+
+```
+"Create a medium office building in Boston."  ← model must select from 142 tools
+```
+
+Both work, but the first produces fewer ToolSearch calls and more predictable tool sequences.
+
+### 2. Enable `defer_loading` (OpenAI-Compatible Clients)
+
+For clients that support the OpenAI `defer_loading` flag, set it on the server config. This exposes only a search tool by default and loads schemas on demand. Reduces first-call overhead by ~85%.
+
+### 3. Use `includeTools` / Per-Tool Toggles (Windsurf, Gemini CLI)
+
+Configure a focused tool subset matching your current workflow phase. A 30-tool simulation workflow subset (~4–5K tokens) is well within any client's context budget and produces cleaner responses than exposing all 142.
+
+### 4. Reference `/runs/` Paths, Not Inline Results
+
+Instead of asking the model to read and summarize large simulation outputs inline, reference them by path:
+
+```
+"The simulation output is at /runs/run-20260415/. Extract the EUI."
+```
+
+This lets `extract_summary_metrics` and `extract_end_use_breakdown` do targeted extraction rather than streaming the full HTML report into context.
+
+### 5. Split Long Workflows Across Conversations (Claude Desktop)
+
+Save model state at key checkpoints with `save_osm_model`. Start a fresh conversation for the next phase. Reference saved files by path. This resets conversation history overhead while preserving all model changes.
+
+---
+
+## Tool Call Latency
+
+Measured April 2026 via OpenLLMetry traces (Jaeger). Environment: Apple M3 Max, Docker amd64 emulation, openstudio-mcp:tracing image.
+
+### Cold Start (first tool call per Docker container)
+
+Includes: Docker container launch + Python server init + first `import openstudio`.
+
+| Phase | Latency |
+|-------|---------|
+| Container start → MCP `initialize` response | ~1.8–2.1s |
+| First `import openstudio` (cold) | included above |
+| `tools/list` (all 142 tool schemas, 117K chars) | ~1s (bundled at init) |
+
+### Warm Tool Call Latency (in-session, OpenStudio already loaded)
+
+| Tool | Avg latency | Notes |
+|------|-------------|-------|
+| `get_server_status` | ~3ms | No OpenStudio ops |
+| `list_skills` | ~1ms | File read only |
+| `validate_model` | ~3ms | Model checks |
+| `list_spaces` / `list_thermal_zones` / `list_air_loops` | 1–5ms | In-memory iteration |
+| `list_weather_files` | ~12ms | EPW file scan |
+| `get_building_info` | ~5–15ms | Model introspection |
+| `get_model_summary` | ~8–23ms | Full object count |
+| `get_versions` | ~150–220ms | OpenStudio SDK call |
+| `create_example_osm` | ~200–215ms | Model build from scratch |
+
+> **Note:** These latencies reflect the MCP server's processing time. Client-visible latency adds the LLM inference time on top (typically 1–10s for tool call generation + JSON parse). The server itself is fast; bottlenecks in multi-step workflows are almost always LLM inference, not tool execution.
+
+### Running Traces Yourself
+
+```bash
+# 1. Start Jaeger
+docker compose -f docker/docker-compose.tracing.yml up -d
+
+# 2. Build tracing image
+docker build --build-arg TELEMETRY=1 -t openstudio-mcp:tracing -f docker/Dockerfile .
+
+# 3. Connect your client with tracing env vars (see compose file header)
+# Traces appear at http://localhost:16686
+```
+
+---
+
+## LLM Accuracy vs. Tool Count
+
+From internal benchmarks and published research:
+
+| Tools Presented Per Turn | Accuracy | Source |
+|--------------------------|----------|--------|
+| 5–7 | ~92% | Jenova.ai |
+| 10–15 | sweet spot | Multiple |
+| 3–5 (ToolSearch output) | 94.4% | openstudio-mcp sweep |
+| 40+ (all visible, no deferral) | Degraded | Allen Chan / IBM |
+| 100+ (no retrieval) | ~13–14% | RAG-MCP |
+| 100+ (with semantic retrieval) | ~43% | RAG-MCP |
+
+The openstudio-mcp benchmark shows 94.4% at 142 tools **because ToolSearch reduces the per-turn visible set to 3–5**. Without ToolSearch (e.g., Claude Desktop), the effective tool count visible to the model per turn is still all 142, but Claude's reasoning capability keeps accuracy high for sessions under ~20 turns.
+
+---
+
+## Local LLM Benchmark: llama3.2:3b vs gemma3:4b
+
+Measured April 2026 on Apple M3 Max (14 CPU, 36 GB RAM, no GPU) via Ollama v0.20.7 + [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) 0.4.11. Tasks: **GSM8K** (math / chain-of-thought reasoning) and **IFEval** (instruction following). 100 samples per task, zero-shot.
+
+| Model | GSM8K flexible-extract | IFEval prompt loose | IFEval inst loose | Runtime | Disk |
+|---|---|---|---|---|---|
+| llama3.2:3b (Meta, US) | **0.67** | 0.630 | 0.767 | 9m35s | 2.0 GB |
+| gemma3:4b (Google DeepMind, US) | 0.55 | **0.750** | **0.828** | 16m39s | 3.3 GB |
+
+**Takeaways:**
+- `llama3.2:3b` wins on math/reasoning (0.67 vs 0.55 GSM8K).
+- `gemma3:4b` wins on instruction following (0.750 vs 0.630 IFEval prompt-level loose) — more relevant for agentic tool use.
+- `gemma3:4b` **does not support native tool calling in Ollama** — the `/api/chat` endpoint returns HTTP 400 for any `tools` field. This makes it unsuitable for measuring MCP schema overhead or running tool-calling benchmarks with Ollama.
+- **`llama3.2:3b` is the recommended model for CI-based MCP overhead benchmarks**: native tool calling, 2.0 GB fits comfortably on GitHub Actions `ubuntu-latest` (16 GB RAM), and is from a US-based company.
+
+### How to Reproduce
+
+```bash
+# Pull both models
+ollama pull llama3.2:3b
+ollama pull gemma3:4b
+
+# Start server
+ollama serve &
+
+# Install deps
+pip install lm_eval langdetect immutabledict nltk
+python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('stopwords')"
+
+# Run benchmark (100 samples each, ~10 min for llama3.2:3b)
+python3 -m lm_eval \
+  --model local-chat-completions \
+  --model_args "model=llama3.2:3b,base_url=http://localhost:11434/v1/chat/completions,num_concurrent=1,max_retries=3,tokenized_requests=False" \
+  --tasks gsm8k,ifeval \
+  --num_fewshot 0 \
+  --limit 100 \
+  --apply_chat_template \
+  --output_path /tmp/lmeval_results/llama32_3b
+```
+
+> **Note:** Use `--model local-chat-completions` (not `openai-chat-completions`). The `base_url` must be the full chat completions path. Loglikelihood tasks (ARC, HellaSwag, MMLU) raise `NotImplementedError` for chat models; only `generate_until` tasks like `gsm8k` and `ifeval` work.
+
+---
+
+## Token Overhead by Scenario (Ollama Measurement)
+
+Measured April 2026 using Ollama `prompt_eval_count` — the actual number of prompt tokens processed. Prompt: "What is the total floor area of the current building?" Three runs per scenario, median reported.
+
+| Scenario | Tools | llama3.2:3b tokens | Delta | First-call latency |
+|---|---|---|---|---|
+| No tools (baseline) | 0 | 36 | — | 0.14s |
+| 5 tools | 5 | 529 | +493 | 0.07s |
+| 30 tools | 30 | 2,579 | +2,543 | 0.09s |
+| 142 tools (synthetic, compact) | 142 | 11,763 | +11,727 | 0.19s |
+| **142 tools (real openstudio-mcp)** | **142** | **~29,000** | **~28,964** | ~0.2s |
+
+The synthetic tools used in the Ollama benchmark averaged ~83 tokens each (compact schema). Real openstudio-mcp tools average **~204 tokens each** (detailed descriptions + parameter schemas), so the real delta scales to approximately **~29K tokens** — consistent with the live server measurements above.
+
+**gemma3:4b:** baseline only — the same no-tools prompt measured 20 tokens (vs. 36 for llama3.2:3b). All tool-bearing requests returned HTTP 400 because `gemma3:4b` does not support tool calling in Ollama's `/api/chat` endpoint, so MCP overhead cannot be measured with this model via Ollama.
+
+---
+
+## Measuring Schema Size
+
+To reproduce the schema measurements in this document, run against the live MCP server:
+
+```python
+import subprocess, json
+
+cmd = ["docker", "run", "--rm", "-i", "-e", "OPENSTUDIO_MCP_MODE=prod",
+       "openstudio-mcp:dev", "openstudio-mcp"]
+
+init_msg = json.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize",
+    "params": {"protocolVersion": "2024-11-05", "capabilities": {},
+               "clientInfo": {"name": "measure", "version": "1.0"}}}) + "\n"
+list_msg = json.dumps({"jsonrpc": "2.0", "id": 2, "method": "tools/list",
+    "params": {}}) + "\n"
+
+proc = subprocess.run(cmd, input=(init_msg + list_msg).encode(),
+                      capture_output=True, timeout=30)
+for line in proc.stdout.decode().split("\n"):
+    try:
+        obj = json.loads(line)
+        if obj.get("id") == 2:
+            tools = obj["result"]["tools"]
+            schema_json = json.dumps(tools)
+            print(f"Tools: {len(tools)}")
+            print(f"Full JSON chars: {len(schema_json):,}")
+            print(f"Est tokens: {len(schema_json)//4:,}")
+            desc_chars = sum(len(t.get("description","")) for t in tools)
+            print(f"Descriptions only: {desc_chars:,} chars / ~{desc_chars//4:,} tokens")
+    except Exception:
+        pass
+```
+
+---
+
+## Evaluation Checklist
+
+When comparing client performance against openstudio-mcp, measure:
+
+- [ ] **First tool call latency** — time from prompt to first tool invocation
+- [ ] **Schema token overhead** — use the script above; compare to client's token counter
+- [ ] **ToolSearch calls per workflow** — how often the model searches before acting
+- [ ] **Accuracy at turn 5 vs. turn 20** — does accuracy degrade in long sessions?
+- [ ] **Failure mode when context is full** — does the model warn, truncate, or silently fail?
+- [ ] **`list_skills` adherence** — does the model follow the skill guide or guess tool params?
+- [ ] **Trace latency** — instrument with `TRACELOOP_BASE_URL` + Jaeger to see per-tool call times (see [`docker/docker-compose.tracing.yml`](../../docker/docker-compose.tracing.yml))
+
+See [`docs/testing/advanced-evaluation-template.md`](../testing/advanced-evaluation-template.md) for a full structured evaluation form.
diff --git a/docs/clients/vs-code-copilot.md b/docs/clients/vs-code-copilot.md
new file mode 100644
index 0000000..1b5cf26
--- /dev/null
+++ b/docs/clients/vs-code-copilot.md
@@ -0,0 +1,106 @@
+# VS Code Copilot Setup
+
+> **Last verified:** April 2026 · VS Code 1.99+ · [Docs](https://code.visualstudio.com/docs/copilot/chat/mcp-servers)
+
+VS Code Copilot (GitHub Copilot Chat in agent mode) supports MCP servers from VS Code 1.99 onward. Its 128-tool hard limit is lower than openstudio-mcp's 142-tool count, so you need to either disable 14+ tools via the UI or use the workspace config's tool filtering.
+
+---
+
+## Prerequisites
+
+- **Docker Desktop** running
+- **VS Code 1.99 or later** ([download](https://code.visualstudio.com/))
+- **GitHub Copilot** extension installed and active subscription
+- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .`
+
+---
+
+## Configuration
+
+VS Code uses `.vscode/mcp.json` (workspace) or the user-profile MCP config (global). **Note:** VS Code uses the key `"servers"`, not `"mcpServers"` — this is different from Claude Desktop and Windsurf.
+
+**Workspace config** (`.vscode/mcp.json` in your project root — can be committed):
+
+```json
+{
+  "servers": {
+    "openstudio-mcp": {
+      "command": "docker",
+      "args": [
+        "run", "--rm", "-i",
+        "-v", "/absolute/path/to/your/inputs:/inputs",
+        "-v", "/absolute/path/to/your/runs:/runs",
+        "-e", "OPENSTUDIO_MCP_MODE=prod",
+        "openstudio-mcp:dev", "openstudio-mcp"
+      ]
+    }
+  }
+}
+```
+
+**Global config** (all workspaces): open the Command Palette (`Cmd/Ctrl+Shift+P`) and run `MCP: Open User Configuration`. The file uses the same `"servers"` key format.
+
+---
+
+## Enabling MCP in Agent Mode
+
+MCP tools are only available in **GitHub Copilot Chat agent mode** (`@workspace` / `@agent`). Regular inline completions do not use MCP tools.
+
+1. Open GitHub Copilot Chat (`Ctrl+Alt+I` / `Cmd+Ctrl+I`)
+2. Switch to agent mode with the dropdown in the chat panel header
+3. Click **Configure Tools** (wrench icon) to see all available MCP tools and toggle them on/off
+4. Send a test prompt:
+
+   > *"Use openstudio-mcp to create an example building model and describe it."*
+
+---
+
+## Handling the 128-Tool Limit
+
+openstudio-mcp provides 142 tools. VS Code Copilot has a 128-tool hard cap across all active MCP servers. If you have other MCP servers enabled, the limit applies to the combined total.
+
+**Option A: Disable low-priority tools via the UI**
+
+In the Configure Tools panel, disable tools you won't use. Good candidates to disable for a first evaluation:
+- `add_pv_to_shading`, `add_ev_load`, `set_lifecycle_cost_params` (renewables/cost, if not needed)
+- `inspect_osm_summary`, `validate_osw`, `run_osw` (advanced file ops)
+- `add_design_day` (if using `change_building_location` instead)
+
+**Option B: Use `includeTools` in the config** *(when supported — check VS Code release notes)*
+
+Some VS Code versions support an `includeTools` array to pre-filter exposed tools before the 128 limit is applied. Check the [MCP configuration reference](https://code.visualstudio.com/docs/copilot/reference/mcp-configuration) for the current schema.
+
+---
+
+## First Prompts
+
+Use these in agent mode (`@agent`):
+
+```
+@agent Create a baseline 10-zone office building with System 7 VAV reheat
+@agent Load /inputs/MyBuilding.osm and list all thermal zones with their setpoints
+@agent Run a simulation on /runs/baseline.osm and show me the EUI
+```
+
+---
+
+## Context & Performance Notes
+
+VS Code Copilot's context window depends on the model selected in the chat panel:
+- GPT-4.1: 128K tokens
+- Claude 3.7 Sonnet: 200K tokens
+- Gemini 2.0 Flash: 1M tokens
+
+With 128 tools loaded, schema overhead is roughly **28K tokens** (reduced slightly from ~29K for the full 142). See [Token Context & Performance](./token-context-performance.md) for a full breakdown.
+
+---
+
+## Troubleshooting
+
+| Symptom | Likely cause | Fix |
+|---------|-------------|-----|
+| No MCP tools appear | Using wrong key (`mcpServers` instead of `servers`) | Check `.vscode/mcp.json` uses `"servers"` |
+| "Trust this server?" prompt blocks startup | New server security check | Click "Trust" to allow the server to start |
+| Tools appear but agent ignores them | Not in agent mode | Switch chat panel to agent mode |
+| 128-tool limit error | Too many tools across all servers | Disable low-priority tools via Configure Tools panel |
+| Config not picked up | Wrong file location | Confirm `.vscode/mcp.json` exists in the workspace root you opened |
diff --git a/docs/clients/windsurf.md b/docs/clients/windsurf.md
new file mode 100644
index 0000000..86b3330
--- /dev/null
+++ b/docs/clients/windsurf.md
@@ -0,0 +1,119 @@
+# Windsurf (Cascade) Setup
+
+> **Last verified:** April 2026 · Windsurf latest · [Docs](https://docs.windsurf.com/windsurf/cascade/mcp)
+
+Windsurf's Cascade AI supports MCP via a global config file. The **100-tool hard limit** means openstudio-mcp is not plug-and-play — you must disable at least 42 tools before Cascade will connect. This guide covers which tools to keep for common BEM workflows.
+
+---
+
+## Prerequisites
+
+- **Docker Desktop** running
+- **Windsurf** installed ([download](https://windsurf.com/download))
+- openstudio-mcp image built: `docker build -t openstudio-mcp:dev -f docker/Dockerfile .`
+
+---
+
+## Configuration
+
+Edit (or create) `~/.codeium/windsurf/mcp_config.json`:
+
+```json
+{
+  "mcpServers": {
+    "openstudio-mcp": {
+      "command": "docker",
+      "args": [
+        "run", "--rm", "-i",
+        "-v", "/absolute/path/to/your/inputs:/inputs",
+        "-v", "/absolute/path/to/your/runs:/runs",
+        "-e", "OPENSTUDIO_MCP_MODE=prod",
+        "openstudio-mcp:dev", "openstudio-mcp"
+      ]
+    }
+  }
+}
+```
+
+After saving, open the **MCP panel** in Windsurf (click the MCPs icon in the top-right Cascade panel). The openstudio-mcp server will appear but will be over the 100-tool limit. Proceed to tool selection below.
+
+---
+
+## Selecting Tools (Required)
+
+Cascade has a hard cap of 100 active tools across all MCP servers. From the MCP settings page, toggle tools off until you are at or below 100. The table below shows a **recommended 80-tool starter set** organized by workflow.
+
+### Always Keep (Core — 21 tools)
+
+| Tool | Why |
+|------|-----|
+| `list_skills`, `get_skill` | Workflow guides — most important for BEM orientation |
+| `get_server_status`, `get_versions` | Health checks |
+| `load_osm_model`, `save_osm_model`, `create_example_osm`, `create_baseline_osm` | Model load/save |
+| `get_model_summary`, `get_building_info` | Model inspection |
+| `list_thermal_zones`, `list_spaces`, `list_air_loops`, `list_plant_loops` | Inventory tools |
+| `run_simulation`, `get_run_status` | Simulation |
+| `extract_summary_metrics`, `extract_end_use_breakdown` | Results |
+| `validate_model` | Pre-sim QA |
+| `recommend_tools` | Tool router |
+| `change_building_location` | Weather setup |
+
+### Add by Workflow
+
+| Workflow | Additional Tools to Enable |
+|----------|--------------------------|
+| New building from scratch | `create_new_building`, `create_bar_building`, `create_typical_building` |
+| HVAC changes | `add_baseline_system`, `list_baseline_systems`, `add_air_loop`, `list_zone_hvac_equipment`, `set_component_properties` |
+| Envelope work | `list_surfaces`, `list_subsurfaces`, `replace_window_constructions`, `get_construction_details`, `list_materials` |
+| Measures | `create_measure`, `test_measure`, `apply_measure`, `list_custom_measures` |
+| Results deep-dive | `extract_hvac_sizing`, `extract_envelope_summary`, `extract_zone_summary`, `compare_runs`, `generate_results_report` |
+| Schedules/loads | `list_thermal_zones` (detailed), `adjust_thermostat_setpoints`, `get_schedule_details` |
+
+### Safe to Disable
+
+Low-use tools that can be re-enabled on demand:
+- `add_pv_to_shading`, `add_rooftop_pv`, `add_ev_load` — renewables
+- `set_lifecycle_cost_params`, `add_cost_per_floor_area` — lifecycle costing
+- `set_adiabatic_boundaries` — special boundary conditions
+- `inspect_osm_summary`, `validate_osw`, `run_osw` — raw file operations
+- `import_floorspacejs` — only if not doing custom geometry imports
+- `add_design_day` — only if not defining custom design days
+
+---
+
+## Verification
+
+After configuring your tool selection:
+
+1. Restart Windsurf or click "Refresh" in the MCP panel
+2. Confirm the tool count shows ≤ 100 in the panel
+3. Send a test prompt in Cascade:
+
+   > *"Use the openstudio MCP tools to list the available skills."*
+
+---
+
+## First Prompts
+
+```
+"Create a baseline 5-zone office building with VAV reheat using openstudio-mcp"
+"Load /inputs/MyBuilding.osm and tell me about its HVAC system"
+"Run a simulation on /runs/model.osm and extract the EUI"
+```
+
+---
+
+## Context & Performance Notes
+
+Windsurf puts all enabled tool schemas into context without deferral, similar to Claude Desktop. With a curated 80-tool set, schema overhead is approximately **9–11K tokens**. See [Token Context & Performance](./token-context-performance.md).
+
+---
+
+## Troubleshooting
+
+| Symptom | Likely cause | Fix |
+|---------|-------------|-----|
+| "Tool limit exceeded" error | More than 100 tools enabled | Disable tools via MCP settings panel |
+| Server listed but tools not available | Tool count > 100 (Cascade rejects the whole server) | Must disable tools before connecting |
+| Config not picked up | Wrong path | Confirm `~/.codeium/windsurf/mcp_config.json` — note: `windsurf/` subdirectory, not `codeium/` directly |
+| Cascade uses its own tools instead of MCP | Prompt doesn't mention MCP | Include "use openstudio-mcp" or "use the openstudio tools" in your prompt |
diff --git a/docs/examples/20_deep_retrofit_package.md b/docs/examples/20_deep_retrofit_package.md
new file mode 100644
index 0000000..47cb8a7
--- /dev/null
+++ b/docs/examples/20_deep_retrofit_package.md
@@ -0,0 +1,148 @@
+# Example 20: Hackathon — Deep Retrofit Package Analysis
+
+Stack energy conservation measures on a baseline office building, simulate before and after, and quantify cumulative savings — all in a single AI-assisted session.
+
+## Scenario
+
+A hackathon team wants to find the maximum achievable energy savings from bundling multiple ECMs on a small commercial office building in Boston. They have four hours to model, simulate, and present results.
+
+The **core package** (validated by integration test) stacks:
+1. **High-R wall insulation** — upgrade all exterior walls to R-20
+2. **Thermostat deadband widening** — expand heating/cooling deadband by 2°F each way
+
+**Optional extensions** (require OpenStudio extension gem environment):
+3. **High-performance windows** — replace all glazing with a better-performing construction
+4. **Rooftop photovoltaics** — add PV panels on 75% of roof area
+
+## Prompt
+
+> Create a small office building in Boston. Run a baseline simulation, then stack high-R wall insulation and thermostat widening, re-simulate, and show me the savings from the package.
+
+## Tool Call Sequence
+
+### Step 1 — Baseline Model + Simulation
+
+```
+1. create_baseline_osm(name="office_baseline", ashrae_sys_num="03", wwr=0.4)
+2. load_osm_model(osm_path=<returned path>)
+3. change_building_location(weather_file="/inputs/USA_MA_Boston-Logan.Intl.AP.725090_TMY3.epw")
+4. save_osm_model(osm_path="/runs/office_baseline.osm")
+5. run_simulation(osm_path="/runs/office_baseline.osm")
+6. get_run_status(run_id=<id>)           # poll until "success"
+7. extract_summary_metrics(run_id=<id>)  # record baseline EUI
+```
+
+### Step 2 — ECM 1: High-R Wall Insulation
+
+Create an R-20 wall construction and assign it to every exterior wall.
+
+```
+8.  create_standard_opaque_material(
+      name="R20_Insulation",
+      thickness_m=0.141,          # R-20 IP: k=0.04 W/m-K → 3.52 m²·K/W
+      conductivity_w_m_k=0.04,
+      density_kg_m3=30.0,
+      specific_heat_j_kg_k=1000.0)
+9.  create_construction(name="High_R_Wall", material_names=["R20_Insulation"])
+10. get_construction_details(construction_name="High_R_Wall")  # verify R-value
+11. list_surfaces(surface_type="Wall", boundary="Outdoors", max_results=0)
+    # For each exterior wall surface name:
+12. assign_construction_to_surface(
+      surface_name=<wall>, construction_name="High_R_Wall")
+    ... repeat for each exterior wall ...
+```
+
+### Step 3 — ECM 2: Thermostat Deadband Widening
+
+Expand the heating/cooling deadband by 2°F in each direction.
+
+```
+13. adjust_thermostat_setpoints(cooling_offset_f=2.0, heating_offset_f=-2.0)
+```
+
+### Step 4 — Retrofit Simulation + Comparison
+
+```
+14. save_osm_model(osm_path="/runs/office_retrofit.osm")
+15. run_simulation(osm_path="/runs/office_retrofit.osm")
+16. get_run_status(run_id=<id>)           # poll until "success"
+17. compare_runs(
+      baseline_run_id=<baseline_id>,
+      retrofit_run_id=<retrofit_id>)
+```
+
+### Optional Extensions
+
+#### ECM 3: High-Performance Windows (requires gem environment)
+
+Use a `SimpleGlazing` material (U-factor + SHGC) to define window performance, then replace all glazing:
+
+```
+18. create_measure(name="add_simple_glazing", ...)  # write a measure that calls
+    #   SimpleGlazing.new(model, u_factor, shgc) and wires it into a Construction
+19. apply_measure(measure_dir=..., arguments={"u_factor": 1.2, "shgc": 0.25})
+20. replace_window_constructions(construction_name="HighPerf_Window")
+```
+
+> **Note:** `create_standard_opaque_material` creates wall/roof layers — not glazing.
+> Window constructions require glazing materials (SimpleGlazing or StandardGlazing)
+> authored via `create_measure`. See Example 01 for the measure-authoring pattern.
+
+#### ECM 4: Rooftop PV (requires gem environment)
+
+```
+21. add_rooftop_pv(fraction_of_surface=0.75, cell_efficiency=0.18)
+```
+
+> **Note on PV comparison:** `compare_runs` tracks end-use energy consumption.
+> For PV, also check `extract_summary_metrics` to see the site energy reduction
+> that accounts for onsite generation.
+
+## Expected Results
+
+| Scenario | EUI (kBtu/ft²/yr) | Savings vs. Baseline |
+|----------|-------------------|----------------------|
+| Baseline (System 3 PSZ-AC, Boston) | ~50–65 | — |
+| + High-R Insulation | ~48–62 | ~3–6% |
+| + Thermostat Widening (cumulative) | ~44–57 | ~8–15% |
+| + Window Upgrade (if available) | ~42–54 | ~12–18% |
+| + Rooftop PV offset | ~37–49 | ~18–28% |
+
+*Exact values depend on model geometry and weather year.*
+
+## Key Tools Used
+
+| Tool | Purpose |
+|------|---------|
+| `create_baseline_osm` | 10-zone model with PSZ-AC system and glazing |
+| `change_building_location` | Weather file + design days (Boston TMY3) |
+| `create_standard_opaque_material` | Define insulation layer (k, density, Cp, thickness) |
+| `create_construction` | Assemble opaque material layers into a wall construction |
+| `get_construction_details` | Verify R-value of new assembly |
+| `list_surfaces` | Find all exterior walls by boundary condition |
+| `assign_construction_to_surface` | Apply new construction wall-by-wall |
+| `adjust_thermostat_setpoints` | Widen heating/cooling deadband |
+| `replace_window_constructions` | Swap all window glazing in one call (optional) |
+| `add_rooftop_pv` | Add PV panels on roof shading surfaces (optional) |
+| `run_simulation` | Launch EnergyPlus |
+| `compare_runs` | EUI delta + per-end-use breakdown between two runs |
+
+## Why This is a Great Hackathon Demo
+
+This workflow demonstrates the unique value of AI-assisted building energy modeling:
+
+- **Manual equivalent**: 4–8 hours per scenario in the OpenStudio GUI
+- **With openstudio-mcp**: All ECMs modeled, simulated, and compared in minutes
+- **Narrative**: "Here's the pathway to 15% savings from just two measures"
+- **Extensibility**: Swap in any building type, location, or ECM package
+
+## Notes
+
+- Save the baseline model **before** applying any ECMs — the ECMs modify in-memory state
+- ECM 1 uses a single-layer R-20 construction as a simplified approximation of a real multi-layer wall assembly; real projects would retain the original assembly and add an insulation layer
+- `compare_runs` reports `energy_grand_total_kBtu` (consumption only, excludes water); for PV-inclusive scenarios, use `extract_summary_metrics` site energy instead
+- `replace_window_constructions` and `add_rooftop_pv` require OpenStudio extension gems; the core ECMs (insulation + thermostat) work with the SDK alone
+
+## Integration Test
+
+See `tests/test_skill_ecm_package.py::test_skill_ecm_package_workflow`
diff --git a/docs/testing/advanced-evaluation-template.md b/docs/testing/advanced-evaluation-template.md
new file mode 100644
index 0000000..e615922
--- /dev/null
+++ b/docs/testing/advanced-evaluation-template.md
@@ -0,0 +1,154 @@
+# OpenStudio-MCP: Advanced Evaluation & Workflow Log
+
+**Date:** [YYYY-MM-DD]  
+**Evaluator:** [Your Name]  
+**Session ID/Commit:** [Insert branch or commit hash]
+
+---
+
+## How to Use This Template
+
+This template guides manual evaluation of the `openstudio-mcp` server across six areas.
+Use it alongside the automated test suite — the tests cover deterministic behavior;
+this template captures LLM-specific behavior that cannot be unit tested.
+
+**Suggested time allocation (20-hour eval):**
+
+| Hours | Focus |
+|---|---|
+| 1–5 | Sections 1 & 2 — Does the LLM use Skills correctly? |
+| 6–12 | Sections 4 & 5 — Long-session state stability + practitioner workflows |
+| 13–17 | Section 3 — Artifact size limits, where does it break? |
+| 18–20 | Section 6 + write-up, draft SECURITY.md updates if gaps found |
+
+---
+
+## 1. Environment & Setup Adherence
+
+Testing the "onboarding" experience. Does the LLM correctly identify the environment
+and its capabilities?
+
+| MCP Client | LLM Model | Initial Tool Discovery | Did it call `list_skills`? | Setup Friction |
+|---|---|---|---|---|
+| Claude Desktop | Claude 3.5 Sonnet | Ad-hoc / Skills-based | [Yes/No] | [e.g. README missing deps] |
+| Cursor | GPT-4o | Ad-hoc / Skills-based | [Yes/No] | [e.g. Prompting issues] |
+
+**Automated coverage:** `tests/test_skill_registration.py`, `tests/test_skill_docs.py`
+
+---
+
+## 2. The "Skills" Orchestration Layer
+
+Instead of consolidating tools, we test how well the LLM uses the provided Markdown "Skills"
+to navigate the server's 142 specialized tools.
+
+**Test Goal:** Does the agent follow the "Skill" guide or try to "guess" tool parameters?
+
+| Target Workflow | Skill Used | Adherence Score (1–5) | Observation / Hallucination |
+|---|---|---|---|
+| HVAC Swap | `add-hvac` | [Score] | [e.g. LLM ignored skill and guessed VAV parameters] |
+| Geometry Edit | `tool-workflows` | [Score] | [e.g. Successfully followed skill sequence] |
+| Simulation Run | `tool-workflows` (simulate) | [Score] | [e.g. Tried to run sim before loading model] |
+| Baseline Generation | `ashrae-baseline-guide` | [Score] | [e.g. Wrong system type selected] |
+| QAQC | `qaqc` | [Score] | [e.g. Ran checks on unsimulated model] |
+
+**Scoring guide:**
+- 5 — Followed skill verbatim, correct tool order, correct parameters
+- 4 — Minor deviation (e.g., skipped one optional step), correct outcome
+- 3 — Partial adherence; required correction prompt
+- 2 — Mostly guessed; skill ignored
+- 1 — Completely wrong tool sequence or hallucinated parameters
+
+**Automated coverage:** `tests/llm/` suite (tool selection, routing, progressive workflows)
+
+---
+
+## 3. Data & Artifact Management (Breaking Points)
+
+Testing the limits of `read_file` vs. `copy_file` for large Building Energy Modeling outputs.
+
+| Artifact Type | File Size | Tool Used | Result (Success/Truncated/Crash) | Context Usage |
+|---|---|---|---|---|
+| `eplusout.err` | [e.g. 50 KB] | `read_file` | Success | [Token Count] |
+| `eplusout.html` | [e.g. 2.5 MB] | `read_file` | [e.g. Truncated at 50 KB] | [Token Count] |
+| `eplusout.html` | [e.g. 2.5 MB] | `copy_file` | [e.g. Successful Local Copy] | N/A |
+| `eplusout.eso` | [e.g. 15 MB] | `read_file` | [e.g. Truncated] | [Token Count] |
+| `eplusout.eso` | [e.g. 15 MB] | `copy_file` | [e.g. Successful Local Copy] | N/A |
+
+**Note on Artifact Limits:** At what point did the LLM lose the ability to analyze
+simulation errors? Document the exact file size and token count where analysis degraded.
+
+**Key behavior to verify:**
+- `read_file` returns `truncated: true` and `bytes_read` when file exceeds `max_bytes` (default 50 KB)
+- `copy_file` has no built-in size limit, but can still fail on insufficient disk space,
+  permission errors, or same-filesystem copy constraints — check `ok` field in the response
+- Chunked reading via `offset` parameter allows paginating through large files
+
+**Automated coverage:** `tests/test_artifact_limits.py` (on `test/artifact-security-coverage` branch)
+
+---
+
+## 4. In-Memory Session & State Persistence
+
+Testing the reliability of the SWIG-wrapped in-memory model manager over long,
+high-turn conversations.
+
+| Total Turns | Model Size (.osm) | Persistence Check | Did it "drop" the model state? |
+|---|---|---|---|
+| 5 Turns | 120 KB | Pass — Changes Kept | No |
+| 15 Turns | 120 KB | [Pass/Fail] | [e.g. Session timed out/cleared] |
+| 25+ Turns | [Size] | [Pass/Fail] | [e.g. SWIG Memory Leak Warning Observed] |
+
+**What to watch for:**
+- "No model loaded" errors appearing mid-session after successful model operations
+- SWIG `memory leak of type 'boost::optional...'` warnings in stderr
+- Model state diverging (e.g., a zone renamed in turn 3 showing original name in turn 20)
+
+**Automated coverage:** `tests/test_session_persistence.py` (on `test/artifact-security-coverage` branch; 20+ sequential operations)
+
+---
+
+## 5. BEM Practitioner Workflow (Visual Case Study)
+
+### Workflow Name: [e.g. ASHRAE 90.1 Baseline Generation]
+
+**Objective:** [Briefly describe the practitioner's goal]
+
+**Step-by-Step Execution:**
+
+1. **User Request:** [Insert Prompt]
+2. **Skill Triggered:** [Insert Skill Name]
+3. **Tool Chain:** [List tools called in sequence]
+4. **Outcome:** [Brief summary of BEM result]
+
+**Visual Documentation:**
+
+> *Screenshot: LLM following the "Skill" Markdown instructions.*  
+> `![Skill Adherence](./images/skill_ui.png)`
+
+> *Screenshot: Final BEM result or 3D visualization verification.*  
+> `![BEM Verification](./images/result_viz.png)`
+
+---
+
+## 6. Security & Path Validation
+
+Quick check for path-traversal vulnerabilities or container leaks.
+
+- **[ ]** Attempted path traversal (`../../etc/passwd`) via `file_path`? **Result:** [Blocked/Allowed]
+- **[ ]** Attempted path traversal in `copy_file` `destination`? **Result:** [Blocked/Allowed]
+- **[ ]** Verified that `copy_file` stays within mounted volume? **Result:** [Yes/No]
+- **[ ]** Attempted to read `/repo` source code via `read_file`? **Result:** [Allowed — `/repo` is intentionally in allowed roots so skill guides and measure templates in the source tree are accessible; be aware this also exposes server source code]
+- **[ ]** Attempted `seed_file: "../../model.osm"` in OSW? **Result:** [Path flattened to basename]
+
+**Automated coverage:** `tests/test_path_safety.py`, `tests/test_artifact_limits.py`
+(both on `test/artifact-security-coverage` branch; see `TestCopyFilePathSafety` class)
+
+---
+
+## Summary & Recommendations
+
+| Finding | Severity | Recommended Action |
+|---|---|---|
+| [e.g. LLM skips list_skills on HVAC tasks] | Medium | [e.g. Add skill reminder to tool descriptions] |
+| [e.g. 2.5 MB HTML truncated, analysis lost] | High | [e.g. LLM should proactively use copy_file for >50 KB] |
diff --git a/mcp_server/server.py b/mcp_server/server.py
index 290156f..935f790 100644
--- a/mcp_server/server.py
+++ b/mcp_server/server.py
@@ -1,63 +1,68 @@
 from __future__ import annotations
 
-from fastmcp import FastMCP
-
 from mcp_server.config import ENABLE_CODE_MODE
 from mcp_server.skills import register_all_skills
 from mcp_server.stdout_suppression import (
     redirect_c_stdout_to_stderr,
     silence_openstudio_stdout_logger,
 )
-
-mcp = FastMCP(
-    "openstudio-mcp",
-    instructions=(
-        "Building energy simulation server (OpenStudio SDK) with 142 tools for "
-        "creating, modifying, simulating, and analyzing building energy models. "
-        "Use these tools for all building energy modeling tasks — if no tool "
-        "exists for a task, ask the user before writing code. "
-        "NEVER write scripts, code, or files to accomplish tasks that these "
-        "tools already handle. Specifically: "
-        "- Measures: ALWAYS use create_measure — never write measure.rb/.py/.xml "
-        "directly. create_measure handles scaffolding, XML, checksums, and "
-        "OS App compatibility. Workflow: create_measure → test_measure → apply_measure. "
-        "- Results/data: use extract_summary_metrics, extract_end_use_breakdown, "
-        "query_timeseries, extract_envelope_summary, extract_hvac_sizing — "
-        "never write Python/SQL scripts to parse eplusout.sql. "
-        "- Visualization: use view_model (3D geometry), view_simulation_data "
-        "(charts/heatmaps), generate_results_report (HTML report) — never write "
-        "matplotlib/plotly/HTML scripts. "
-        "- Models: use create_new_building, create_bar_building, import_floorspacejs "
-        "— never write raw IDF or OSM files. "
-        "- Weather: use change_building_location (sets EPW+DDY+CZ in one call) "
-        "or list_weather_files — never download or write weather files. "
-        "- HVAC: use add_baseline_system, add_doas_system, add_vrf_system — "
-        "never write OpenStudio SDK scripts to wire HVAC components. "
-        "For custom HVAC measures, call search_wiring_patterns to get working "
-        "Ruby wiring code, and search_api to verify methods exist. "
-        "If a file path is given, use it directly. If a file operation fails, "
-        "you may call list_files once to find the right path, then retry — "
-        "do not call list_files more than once for the same file. "
-        "Use list_weather_files for EPW discovery — do not use list_files for weather. "
-        "To find objects by type, use list_model_objects(object_type). "
-        "List tools default to 10 results — use filters to narrow, or "
-        "max_results=0 for all. Prefer list tools before detail tools to "
-        "find the right name. "
-        "When polling get_run_status, wait at least 1-2 minutes between calls. "
-        "For multi-step workflows, call list_skills() first."
-    ),
-)
-
-register_all_skills(mcp)
-
-if ENABLE_CODE_MODE:
-    from fastmcp.experimental.transforms.code_mode import CodeMode
-    mcp.add_transform(CodeMode())
+from mcp_server.telemetry import init_telemetry
 
 
 def main():
     silence_openstudio_stdout_logger()
     redirect_c_stdout_to_stderr()
+    # init_telemetry() must run before FastMCP is instantiated so that
+    # McpInstrumentor().instrument() can patch FastMCP.__init__ in time.
+    init_telemetry()
+
+    from fastmcp import FastMCP
+
+    mcp = FastMCP(
+        "openstudio-mcp",
+        instructions=(
+            "Building energy simulation server (OpenStudio SDK) with 142 tools for "
+            "creating, modifying, simulating, and analyzing building energy models. "
+            "Use these tools for all building energy modeling tasks — if no tool "
+            "exists for a task, ask the user before writing code. "
+            "NEVER write scripts, code, or files to accomplish tasks that these "
+            "tools already handle. Specifically: "
+            "- Measures: ALWAYS use create_measure — never write measure.rb/.py/.xml "
+            "directly. create_measure handles scaffolding, XML, checksums, and "
+            "OS App compatibility. Workflow: create_measure → test_measure → apply_measure. "
+            "- Results/data: use extract_summary_metrics, extract_end_use_breakdown, "
+            "query_timeseries, extract_envelope_summary, extract_hvac_sizing — "
+            "never write Python/SQL scripts to parse eplusout.sql. "
+            "- Visualization: use view_model (3D geometry), view_simulation_data "
+            "(charts/heatmaps), generate_results_report (HTML report) — never write "
+            "matplotlib/plotly/HTML scripts. "
+            "- Models: use create_new_building, create_bar_building, import_floorspacejs "
+            "— never write raw IDF or OSM files. "
+            "- Weather: use change_building_location (sets EPW+DDY+CZ in one call) "
+            "or list_weather_files — never download or write weather files. "
+            "- HVAC: use add_baseline_system, add_doas_system, add_vrf_system — "
+            "never write OpenStudio SDK scripts to wire HVAC components. "
+            "For custom HVAC measures, call search_wiring_patterns to get working "
+            "Ruby wiring code, and search_api to verify methods exist. "
+            "If a file path is given, use it directly. If a file operation fails, "
+            "you may call list_files once to find the right path, then retry — "
+            "do not call list_files more than once for the same file. "
+            "Use list_weather_files for EPW discovery — do not use list_files for weather. "
+            "To find objects by type, use list_model_objects(object_type). "
+            "List tools default to 10 results — use filters to narrow, or "
+            "max_results=0 for all. Prefer list tools before detail tools to "
+            "find the right name. "
+            "When polling get_run_status, wait at least 1-2 minutes between calls. "
+            "For multi-step workflows, call list_skills() first."
+        ),
+    )
+
+    register_all_skills(mcp)
+
+    if ENABLE_CODE_MODE:
+        from fastmcp.experimental.transforms.code_mode import CodeMode
+        mcp.add_transform(CodeMode())
+
     mcp.run()
 
 
diff --git a/mcp_server/skills/common_measures/wrappers.py b/mcp_server/skills/common_measures/wrappers.py
index 6b08625..caed813 100644
--- a/mcp_server/skills/common_measures/wrappers.py
+++ b/mcp_server/skills/common_measures/wrappers.py
@@ -11,6 +11,7 @@
 from typing import Any
 
 from mcp_server.skills.measures.operations import apply_measure
+from mcp_server.telemetry import traced
 
 
 def _ensure_climate_zone() -> None:
@@ -154,6 +155,7 @@ def generate_results_report_op(units: str = "IP", run_id: str | None = None) ->
 
 # --- 4. run_qaqc_checks: ASHRAE QA/QC ---
 
+@traced()
 def run_qaqc_checks_op(
     template: str = "90.1-2013",
     checks: list[str] | None = None,
diff --git a/mcp_server/skills/comstock/operations.py b/mcp_server/skills/comstock/operations.py
index 0a07fd0..03935a5 100644
--- a/mcp_server/skills/comstock/operations.py
+++ b/mcp_server/skills/comstock/operations.py
@@ -18,6 +18,7 @@
 from mcp_server.config import RUN_ROOT
 from mcp_server.model_manager import get_model
 from mcp_server.skills.measures.operations import apply_measure
+from mcp_server.telemetry import traced
 
 # Category classification for ComStock measures
 _BASELINE_PREFIXES = (
@@ -115,6 +116,7 @@ def list_comstock_measures(category: str | None = None) -> dict[str, Any]:
 }
 
 
+@traced()
 def create_typical_building(
     template: str = "90.1-2019",
     building_type: str = "SmallOffice",
@@ -299,6 +301,7 @@ def _create_empty_model() -> Path:
     return osm_path
 
 
+@traced()
 def create_bar_building(
     building_type: str = "SmallOffice",
     total_bldg_floor_area: float = 10000,
@@ -407,6 +410,7 @@ def create_bar_building(
     return result
 
 
+@traced()
 def create_new_building(
     # Bar geometry args
     building_type: str = "SmallOffice",
diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index 189ab9d..ffccda9 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -14,6 +14,7 @@
 import openstudio
 
 from mcp_server.config import INPUT_ROOT, RUN_ROOT
+from mcp_server.telemetry import traced
 
 CUSTOM_MEASURES_DIR = RUN_ROOT / "custom_measures"
 
@@ -740,6 +741,7 @@ def _write_test_file(measure_dir: Path, class_name: str, args: list[dict],
 
 # ── Public operations ────────────────────────────────────────────────
 
+@traced()
 def create_measure_op(
     name: str,
     description: str,
diff --git a/mcp_server/skills/measures/operations.py b/mcp_server/skills/measures/operations.py
index 52a0c4d..a1238a2 100644
--- a/mcp_server/skills/measures/operations.py
+++ b/mcp_server/skills/measures/operations.py
@@ -18,6 +18,7 @@
 
 from mcp_server.config import OSCLI_GEM_PATH, OSCLI_GEMFILE, RUN_ROOT
 from mcp_server.model_manager import get_model, load_model
+from mcp_server.telemetry import traced
 from mcp_server.util import resolve_run_dir
 
 
@@ -115,6 +116,7 @@ def _parse_runner_messages(out_osw_path: Path) -> dict[str, Any] | None:
         return None
 
 
+@traced()
 def apply_measure(
     measure_dir: str,
     arguments: dict[str, Any] | None = None,
diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py
index 46bdfcd..34673bd 100644
--- a/mcp_server/skills/simulation/operations.py
+++ b/mcp_server/skills/simulation/operations.py
@@ -14,6 +14,7 @@
 import psutil
 
 from mcp_server.config import LOG_TAIL_DEFAULT, OSCLI_GEM_PATH, OSCLI_GEMFILE, RUN_ROOT
+from mcp_server.telemetry import traced
 from mcp_server.util import resolve_run_dir
 
 # Where the MCP server stores runs inside the container
@@ -603,6 +604,7 @@ def validate_model_op() -> dict[str, Any]:
     }
 
 
+@traced()
 def run_simulation(osm_path: str, epw_path: str | None = None, name: str | None = None) -> dict[str, Any]:
     """Create a minimal OSW from an OSM file and run the simulation.
 
diff --git a/mcp_server/telemetry.py b/mcp_server/telemetry.py
new file mode 100644
index 0000000..3691bc2
--- /dev/null
+++ b/mcp_server/telemetry.py
@@ -0,0 +1,250 @@
+"""OpenLLMetry (Traceloop) tracing for the openstudio-mcp server.
+
+Optional: a no-op unless traceloop-sdk is installed (included in [telemetry] extra).
+Zero overhead when absent: no import errors, all calls become pass-throughs.
+
+Install:
+    pip install 'openstudio-mcp[telemetry]'
+
+Environment variables:
+    TRACELOOP_BASE_URL    OTLP / Traceloop-compatible endpoint, e.g.:
+                            http://localhost:4318  (local OTEL collector)
+                            https://api.traceloop.com  (Traceloop cloud, needs API key)
+                          Unset -> telemetry disabled (no-op).
+    TRACELOOP_API_KEY     API key for Traceloop cloud (not required for generic OTLP).
+    OTEL_SERVICE_NAME     Service name emitted on every span. Default: "openstudio-mcp".
+    OTEL_EXPORT_BATCH     "false" -> sync exporting (dev). Default: batch mode.
+    TRACELOOP_TRACE_CONTENT  "false" -> omit tool args from spans (privacy).
+
+Usage:
+    from mcp_server.telemetry import init_telemetry, trace_operation, traced
+
+    # In main() before mcp.run():
+    init_telemetry()
+
+    # Decorate a key operation:
+    @traced()
+    def run_simulation(osm_path: str, ...) -> dict: ...
+
+    # Or use a context manager for finer control:
+    with trace_operation("prepare_model", {"path": osm_path}) as span:
+        result = do_work()
+"""
+from __future__ import annotations
+
+import importlib.util
+import json
+import logging
+import sys
+from contextlib import contextmanager
+from typing import Any, Callable, TypeVar
+
+logger = logging.getLogger(__name__)
+
+_TELEMETRY_INITIALIZED = False  # set once init_telemetry() has run, whatever the outcome
+# True only after init_telemetry() finishes Traceloop.init() and MCP instrumentation.
+# traced() checks this at call time to avoid traceloop stdout warnings.
+_TELEMETRY_ENABLED = False
+try:
+    _SDK_AVAILABLE = importlib.util.find_spec("traceloop.sdk") is not None
+except (ModuleNotFoundError, ValueError):  # find_spec imports the parent package first
+    _SDK_AVAILABLE = False
+
+# Truncation threshold for span attribute values (see _truncate / _mark_span_error).
+_MAX_ATTR_LEN = 512
+
+F = TypeVar("F", bound=Callable[..., Any])  # lets traced() keep the decorated callable's type
+
+
+class _NoopSpan:
+    """Do-nothing span yielded by trace_operation() if opentelemetry cannot be imported."""
+
+    def set_attribute(self, *args: Any, **kwargs: Any) -> None:
+        return None
+
+    def set_status(self, *args: Any, **kwargs: Any) -> None:
+        return None
+
+    def record_exception(self, *args: Any, **kwargs: Any) -> None:
+        return None
+
+
+def init_telemetry() -> bool:
+    """Initialize OpenLLMetry tracing.  Idempotent — safe to call multiple times.
+
+    Returns True if telemetry was enabled; False if the SDK is absent, no
+    endpoint is configured, or initialization raised.  When enabled, calls
+    Traceloop.init() to configure the OTLP exporter, then
+    McpInstrumentor().instrument() to auto-trace every FastMCP tool call.
+    """
+    global _TELEMETRY_INITIALIZED, _TELEMETRY_ENABLED
+
+    if _TELEMETRY_INITIALIZED:
+        return _TELEMETRY_ENABLED
+
+    import os
+
+    # Read the endpoint once: both the SDK-missing and SDK-present paths need it.
+    endpoint = os.environ.get("TRACELOOP_BASE_URL", "").strip()
+    if not _SDK_AVAILABLE:
+        if endpoint:
+            logger.warning(
+                "TRACELOOP_BASE_URL is set but traceloop-sdk is not installed. "
+                "Install telemetry extras: pip install 'openstudio-mcp[telemetry]'"
+            )
+        _TELEMETRY_INITIALIZED = True
+        return False
+
+    if not endpoint:
+        logger.debug("TRACELOOP_BASE_URL not set -- telemetry disabled")
+        _TELEMETRY_INITIALIZED = True
+        return False
+
+    try:
+        from opentelemetry.instrumentation.mcp import McpInstrumentor
+        from traceloop.sdk import Traceloop
+
+        service_name = os.environ.get("OTEL_SERVICE_NAME", "openstudio-mcp")
+        disable_batch = os.environ.get("OTEL_EXPORT_BATCH", "true").lower() == "false"
+
+        # Initialize Traceloop FIRST so its TracerProvider is live before we
+        # patch FastMCP.  McpInstrumentor wraps FastMCP tool calls; if the
+        # provider isn't established yet those spans have nowhere to go.
+        # Traceloop.init() uses print() for status messages — redirect sys.stdout
+        # to stderr to avoid corrupting the MCP JSON-RPC stdio pipe.
+        _orig_stdout = sys.stdout
+        sys.stdout = sys.stderr
+        try:
+            Traceloop.init(
+                app_name=service_name,
+                api_endpoint=endpoint,
+                disable_batch=disable_batch,
+            )
+        finally:
+            sys.stdout = _orig_stdout
+
+        # Patch FastMCP AFTER the provider is live so auto-traced tool calls
+        # have a real exporter destination.
+        McpInstrumentor().instrument()
+
+        _TELEMETRY_INITIALIZED = True
+        _TELEMETRY_ENABLED = True
+        logger.info(
+            "OpenLLMetry enabled: endpoint=%s service=%s batch=%s",
+            endpoint,
+            service_name,
+            not disable_batch,
+        )
+        return True
+
+    except Exception:
+        logger.exception("Failed to initialize OpenLLMetry -- telemetry disabled")
+        _TELEMETRY_INITIALIZED = True
+        return False
+
+
+@contextmanager
+def trace_operation(name: str, attributes: dict[str, Any] | None = None):
+    """Context manager that wraps a block in a child INTERNAL span.
+
+    Uses the active OpenTelemetry TracerProvider (configured by Traceloop.init()).
+    Yields a no-op span when opentelemetry is not installed or telemetry is not
+    configured — safe to use unconditionally.  Exceptions always propagate.
+
+    Args:
+        name: Span name, e.g. "prepare_model".
+        attributes: Optional initial attributes (values truncated to _MAX_ATTR_LEN).
+
+    Yields:
+        The active Span (may be a NonRecordingSpan or _NoopSpan when telemetry is off).
+
+    Example::
+
+        with trace_operation("apply_measure", {"measure_dir": measure_dir}) as span:
+            result = _do_apply(...)
+            span.set_attribute("ok", str(result.get("ok", False)))
+    """
+    try:
+        from opentelemetry import trace
+        from opentelemetry.trace import NonRecordingSpan, SpanKind
+    except ImportError:
+        yield _NoopSpan()
+        return
+
+    tracer = trace.get_tracer("openstudio-mcp")
+    # start_as_current_span() is left at its OpenTelemetry defaults
+    # (record_exception=True, set_status_on_exception=True): an exception that
+    # escapes the block is recorded on the span and the span is marked ERROR
+    # before the exception propagates.  Recording it here by hand as well
+    # would attach the same exception event to the span twice, so this
+    # function only sets the initial attributes and yields.
+    with tracer.start_as_current_span(name, kind=SpanKind.INTERNAL) as span:
+        # Skip the serialization work when the span is not being recorded.
+        if attributes and not isinstance(span, NonRecordingSpan):
+            for key, val in attributes.items():
+                span.set_attribute(key, _truncate(val))
+        yield span
+
+
+def traced(op_name: str | None = None) -> Callable[[F], F]:
+    """Build a decorator that runs a synchronous callable inside a trace span.
+
+    The span comes from trace_operation() and is only opened once telemetry
+    has been successfully enabled via init_telemetry(); otherwise the callable
+    runs directly.  This avoids traceloop stdout warnings when the SDK is
+    installed but no endpoint is set.  A dict result with ok=False marks the
+    span ERROR; the result itself is returned unchanged.
+
+    Args:
+        op_name: Span name override.  Defaults to the function name.
+
+    Example::
+
+        @traced()
+        def run_simulation(osm_path: str, ...) -> dict: ...
+    """
+    from functools import wraps
+
+    def decorate(func: F) -> F:
+        label = op_name if op_name else func.__name__
+
+        @wraps(func)
+        def traced_call(*args: Any, **kwargs: Any) -> Any:
+            if _TELEMETRY_ENABLED:
+                with trace_operation(label) as active_span:
+                    outcome = func(*args, **kwargs)
+                    if isinstance(outcome, dict) and outcome.get("ok") is False:
+                        _mark_span_error(active_span, outcome)
+                    return outcome
+            else:
+                return func(*args, **kwargs)
+
+        return traced_call  # type: ignore[return-value]
+
+    return decorate
+
+
+def _mark_span_error(span: Any, result: dict[str, Any]) -> None:
+    """Best-effort: set ERROR status and an error.message attribute from the result dict."""
+    try:
+        from opentelemetry.trace import NonRecordingSpan, StatusCode
+
+        if isinstance(span, NonRecordingSpan):
+            return
+        error_msg = result.get("error") or result.get("message") or "tool returned ok=False"
+        span.set_status(StatusCode.ERROR, str(error_msg))
+        span.set_attribute("error.message", str(error_msg)[:_MAX_ATTR_LEN])
+    except Exception:  # telemetry must never break the traced call, but leave a trace of why
+        logger.debug("Could not mark span as ERROR", exc_info=True)
+
+
+def _truncate(value: Any) -> str:
+    """Serialize a value to a JSON string of at most _MAX_ATTR_LEN chars, "..." included."""
+    try:
+        s = json.dumps(value, default=str)
+    except Exception:
+        s = str(value)
+    if len(s) > _MAX_ATTR_LEN:
+        return s[: _MAX_ATTR_LEN - 3] + "..."
+    return s
+
diff --git a/pyproject.toml b/pyproject.toml
index 3e67bef..5e32ca8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,10 @@ dev = [
   "pytest-timeout>=2.3.1",
   "mcp",
   "pyyaml>=6.0",
+  "opentelemetry-sdk>=1.38.0",
+]
+telemetry = [
+  "traceloop-sdk>=0.49.2",
 ]
 
 [project.scripts]
diff --git a/tests/test_skill_ecm_package.py b/tests/test_skill_ecm_package.py
new file mode 100644
index 0000000..463ca65
--- /dev/null
+++ b/tests/test_skill_ecm_package.py
@@ -0,0 +1,237 @@
+"""Integration test for Example 20: Hackathon Deep Retrofit Package workflow.
+
+Exercises: create baseline → simulate → apply wall insulation ECM (all exterior
+walls) + thermostat widening ECM → re-simulate → compare_runs shows energy reduction.
+
+This validates the core ECM stacking workflow described in
+docs/examples/20_deep_retrofit_package.md.
+"""
+import asyncio
+import uuid
+
+import pytest
+from conftest import EPW_PATH, integration_enabled, poll_until_done, server_params, unwrap
+from mcp import ClientSession
+from mcp.client.stdio import stdio_client
+
+
+@pytest.mark.integration
+def test_skill_ecm_package_workflow():
+    """Hackathon ECM package: baseline sim → wall insulation + thermostat ECMs → compare."""
+    # Validates: multiple ECMs (exterior wall insulation + thermostat widening) can be
+    # stacked on a baseline model and produce a measurable energy reduction vs. baseline.
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+
+    async def _run():
+        async with stdio_client(server_params()) as (r, w):
+            async with ClientSession(r, w) as s:
+                await s.initialize()
+                name = f"ecm_pkg_{uuid.uuid4().hex[:8]}"
+
+                # ----------------------------------------------------------------
+                # Step 1: Create baseline model (System 3 PSZ-AC, 40% WWR)
+                # ----------------------------------------------------------------
+                cr = unwrap(await s.call_tool("create_baseline_osm", {
+                    "name": name,
+                    "ashrae_sys_num": "03",
+                    "wwr": 0.4,
+                }))
+                assert cr["ok"] is True, f"create_baseline_osm failed: {cr}"
+
+                lr = unwrap(await s.call_tool("load_osm_model", {
+                    "osm_path": cr["osm_path"],
+                }))
+                assert lr["ok"] is True
+
+                # Step 2: Set weather + design days (Boston TMY3)
+                wr = unwrap(await s.call_tool("change_building_location", {
+                    "weather_file": EPW_PATH,
+                }))
+                assert wr["ok"] is True, f"change_building_location failed: {wr}"
+
+                # Step 3: Save baseline and simulate
+                baseline_path = f"/runs/{name}_baseline.osm"
+                sr = unwrap(await s.call_tool("save_osm_model", {
+                    "osm_path": baseline_path,
+                }))
+                assert sr["ok"] is True
+
+                sim = unwrap(await s.call_tool("run_simulation", {
+                    "osm_path": baseline_path,
+                    "epw_path": EPW_PATH,
+                }))
+                assert sim["ok"] is True, f"baseline run_simulation failed: {sim}"
+                baseline_run_id = sim["run_id"]
+
+                status = await poll_until_done(s, baseline_run_id)
+                assert status["run"]["status"] == "success", f"Baseline sim failed: {status}"
+
+                # Step 4: Verify baseline has annual results (required for compare_runs)
+                baseline_metrics = unwrap(await s.call_tool("extract_summary_metrics", {
+                    "run_id": baseline_run_id,
+                }))
+                assert baseline_metrics["ok"] is True
+                b_metrics = baseline_metrics.get("metrics", baseline_metrics)
+                baseline_eui = b_metrics.get("eui_kBtu_ft2")
+                assert baseline_eui is not None, (
+                    "Baseline simulation must produce annual EUI results; "
+                    f"got metrics keys: {list(b_metrics.keys())}"
+                )
+                assert baseline_eui > 0, f"Baseline EUI should be positive, got {baseline_eui}"
+
+                # ----------------------------------------------------------------
+                # ECM 1: High-R wall insulation
+                # Create R-20 (IP) single-layer construction:
+                #   R-20 IP = 3.52 m²·K/W → thickness = R×k = 3.52×0.04 = 0.141 m
+                # ----------------------------------------------------------------
+                mat = unwrap(await s.call_tool("create_standard_opaque_material", {
+                    "name": "R20_Insulation",
+                    "thickness_m": 0.141,
+                    "conductivity_w_m_k": 0.04,
+                    "density_kg_m3": 30.0,
+                    "specific_heat_j_kg_k": 1000.0,
+                }))
+                assert mat["ok"] is True, f"create_standard_opaque_material failed: {mat}"
+
+                con = unwrap(await s.call_tool("create_construction", {
+                    "name": "High_R_Wall",
+                    "material_names": ["R20_Insulation"],
+                }))
+                assert con["ok"] is True, f"create_construction failed: {con}"
+
+                # Verify the construction R-value before applying:
+                # R-20 IP = 3.52 m²·K/W → thickness=0.141 m, k=0.04 W/m·K
+                con_details = unwrap(await s.call_tool("get_construction_details", {
+                    "construction_name": "High_R_Wall",
+                }))
+                assert con_details["ok"] is True
+                layers = con_details["construction"]["layers"]
+                assert len(layers) == 1, f"Expected 1 layer, got {len(layers)}"
+                layer = layers[0]
+                assert abs(layer["thickness_m"] - 0.141) < 0.001, (
+                    f"R-20 insulation thickness should be ~0.141 m, got {layer['thickness_m']}"
+                )
+                assert abs(layer["conductivity_w_m_k"] - 0.04) < 0.001, (
+                    f"R-20 insulation conductivity should be 0.04 W/m·K, got {layer['conductivity_w_m_k']}"
+                )
+
+                # Find all exterior walls and apply new construction
+                surfs = unwrap(await s.call_tool("list_surfaces", {
+                    "surface_type": "Wall",
+                    "boundary": "Outdoors",
+                    "max_results": 0,
+                }))
+                assert surfs["ok"] is True
+                ext_walls = surfs["surfaces"]
+                assert len(ext_walls) > 0, "Baseline model must have exterior walls"
+
+                for wall in ext_walls:
+                    assign = unwrap(await s.call_tool("assign_construction_to_surface", {
+                        "surface_name": wall["name"],
+                        "construction_name": "High_R_Wall",
+                    }))
+                    assert assign["ok"] is True, (
+                        f"assign_construction_to_surface failed for '{wall['name']}': {assign}"
+                    )
+
+                # Spot-check: verify the first wall's construction actually changed
+                spot_check = unwrap(await s.call_tool("get_surface_details", {
+                    "surface_name": ext_walls[0]["name"],
+                }))
+                assert spot_check["ok"] is True
+                assert spot_check["surface"]["construction"] == "High_R_Wall", (
+                    f"Wall '{ext_walls[0]['name']}' construction not updated: "
+                    f"got '{spot_check['surface']['construction']}'"
+                )
+
+                # ----------------------------------------------------------------
+                # ECM 2: Thermostat deadband widening
+                # Capture cooling schedule name before and after to confirm the
+                # measure cloned and modified the schedules (not a no-op).
+                # ----------------------------------------------------------------
+                zones_before = unwrap(await s.call_tool("list_thermal_zones", {
+                    "detailed": True,
+                    "max_results": 1,
+                }))
+                assert zones_before["ok"] is True
+                assert len(zones_before["thermal_zones"]) > 0
+                clg_sched_before = zones_before["thermal_zones"][0].get("cooling_setpoint_schedule")
+
+                ecm2 = unwrap(await s.call_tool("adjust_thermostat_setpoints", {
+                    "cooling_offset_f": 2.0,
+                    "heating_offset_f": -2.0,
+                }))
+                assert ecm2["ok"] is True, f"adjust_thermostat_setpoints failed: {ecm2}"
+
+                zones_after = unwrap(await s.call_tool("list_thermal_zones", {
+                    "detailed": True,
+                    "max_results": 1,
+                }))
+                assert zones_after["ok"] is True
+                clg_sched_after = zones_after["thermal_zones"][0].get("cooling_setpoint_schedule")
+                # The measure clones schedules — the name must change
+                if clg_sched_before is not None:
+                    assert clg_sched_after != clg_sched_before, (
+                        f"Thermostat cooling schedule was not updated by ECM 2: "
+                        f"schedule name unchanged ('{clg_sched_before}')"
+                    )
+
+                # ----------------------------------------------------------------
+                # Step 5: Save retrofit model and simulate
+                # ----------------------------------------------------------------
+                retrofit_path = f"/runs/{name}_retrofit.osm"
+                sr2 = unwrap(await s.call_tool("save_osm_model", {
+                    "osm_path": retrofit_path,
+                }))
+                assert sr2["ok"] is True
+
+                sim2 = unwrap(await s.call_tool("run_simulation", {
+                    "osm_path": retrofit_path,
+                    "epw_path": EPW_PATH,
+                }))
+                assert sim2["ok"] is True, f"retrofit run_simulation failed: {sim2}"
+                retrofit_run_id = sim2["run_id"]
+
+                status2 = await poll_until_done(s, retrofit_run_id)
+                assert status2["run"]["status"] == "success", f"Retrofit sim failed: {status2}"
+
+                # ----------------------------------------------------------------
+                # Step 6: Compare runs — ECMs should reduce energy
+                # ----------------------------------------------------------------
+                comparison = unwrap(await s.call_tool("compare_runs", {
+                    "baseline_run_id": baseline_run_id,
+                    "retrofit_run_id": retrofit_run_id,
+                }))
+                assert comparison["ok"] is True, f"compare_runs failed: {comparison}"
+                # Verify compare_runs used the correct run IDs
+                assert comparison["baseline"]["run_id"] == baseline_run_id
+                assert comparison["retrofit"]["run_id"] == retrofit_run_id
+                # ECM package should reduce EUI — delta must be negative
+                delta_eui = comparison.get("delta_eui_kBtu_ft2")
+                assert delta_eui is not None, (
+                    "compare_runs must produce a delta_eui_kBtu_ft2 when both runs have annual results"
+                )
+                assert delta_eui < 0, (
+                    f"ECM package should reduce EUI: delta_eui={delta_eui:.2f} kBtu/ft² "
+                    f"(expected negative)"
+                )
+
+                # Retrofit EUI must be lower than baseline (both ECMs reduce energy)
+                retro_metrics = unwrap(await s.call_tool("extract_summary_metrics", {
+                    "run_id": retrofit_run_id,
+                }))
+                assert retro_metrics["ok"] is True
+                r_metrics = retro_metrics.get("metrics", retro_metrics)
+                retrofit_eui = r_metrics.get("eui_kBtu_ft2")
+                assert retrofit_eui is not None, (
+                    "Retrofit simulation must produce annual EUI results; "
+                    f"got metrics keys: {list(r_metrics.keys())}"
+                )
+                # Cross-check of the compare_runs delta: only the direction is asserted, not the magnitude
+                assert retrofit_eui < baseline_eui, (
+                    f"ECM package should reduce EUI: "
+                    f"baseline={baseline_eui:.2f}, retrofit={retrofit_eui:.2f} kBtu/ft²"
+                )
+
+    asyncio.run(_run())
diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py
new file mode 100644
index 0000000..8ea05cd
--- /dev/null
+++ b/tests/test_telemetry.py
@@ -0,0 +1,576 @@
+"""Unit tests for mcp_server/telemetry.py (OpenLLMetry / traceloop-sdk integration).
+
+These tests run without OpenStudio and without Docker.
+
+Validates:
+- Telemetry is a no-op when TRACELOOP_BASE_URL is not set
+- init_telemetry() calls McpInstrumentor().instrument() and Traceloop.init()
+- init_telemetry() is idempotent and returns correct value on second call
+- McpInstrumentor is only called when endpoint is configured
+- traced() is a no-op when telemetry is not enabled
+- traced() creates a span and marks ERROR on ok=False when telemetry is enabled
+- trace_operation() creates a child span when a TracerProvider is configured
+- _truncate() caps values at _MAX_ATTR_LEN
+- sys.stdout is restored even when Traceloop.init() raises
+- init_telemetry() handles Traceloop.init() exceptions gracefully
+
+Regression: these tests guard against the telemetry module breaking server
+startup or silently swallowing init errors.
+"""
+from __future__ import annotations
+
+import sys
+from contextlib import contextmanager
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+@contextmanager
+def _reset_telemetry_module():
+    """Drop the cached mcp_server.telemetry so imports in the block start fresh; restore it after."""
+    target = "mcp_server.telemetry"
+    previous = sys.modules.pop(target, None)
+    try:
+        yield
+    finally:
+        sys.modules.pop(target, None)
+        if previous is not None:
+            sys.modules[target] = previous
+
+
+def _make_in_memory_setup():
+    """Build an SDK TracerProvider exporting to memory; returns (provider, exporter, tracer)."""
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+
+    span_sink = InMemorySpanExporter()
+    tracer_provider = TracerProvider()
+    # Attach the exporter before creating the tracer, as the tests expect spans in span_sink.
+    tracer_provider.add_span_processor(SimpleSpanProcessor(span_sink))
+    return tracer_provider, span_sink, tracer_provider.get_tracer("test")
+
+
+# ---------------------------------------------------------------------------
+# init_telemetry tests
+# ---------------------------------------------------------------------------
+
+@pytest.mark.unit
+def test_init_no_endpoint_returns_false(monkeypatch):
+    # Validates: init_telemetry returns False when TRACELOOP_BASE_URL is unset, and does NOT call McpInstrumentor or Traceloop.init.
+    monkeypatch.delenv("TRACELOOP_BASE_URL", raising=False)
+
+    mock_traceloop = MagicMock()
+    mock_instrumentor = MagicMock()
+
+    with _reset_telemetry_module():
+        with patch.dict("sys.modules", {
+            "traceloop": MagicMock(),
+            "traceloop.sdk": mock_traceloop,
+            "traceloop.sdk.decorators": MagicMock(),
+            "opentelemetry.instrumentation.mcp": mock_instrumentor,
+        }):
+            import mcp_server.telemetry as tel
+            tel._SDK_AVAILABLE = True
+            result = tel.init_telemetry()
+
+    assert result is False
+    mock_traceloop.Traceloop.init.assert_not_called()
+    # McpInstrumentor should NOT be called when no endpoint is set
+    mock_instrumentor.McpInstrumentor.assert_not_called()
+
+
+@pytest.mark.unit
+def test_init_no_sdk_returns_false(monkeypatch):
+    # Validates: init_telemetry returns False when the SDK is absent and no endpoint is configured (log output is not asserted).
+    monkeypatch.delenv("TRACELOOP_BASE_URL", raising=False)
+
+    with _reset_telemetry_module():
+        import mcp_server.telemetry as tel
+        tel._SDK_AVAILABLE = False
+        result = tel.init_telemetry()
+
+    assert result is False
+
+
+@pytest.mark.unit
+def test_init_sdk_missing_with_endpoint_logs_warning(monkeypatch, caplog):
+    # Validates: a warning is logged when endpoint is set but SDK is not installed.
+    import logging
+    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")
+
+    with _reset_telemetry_module():
+        import mcp_server.telemetry as tel
+        tel._SDK_AVAILABLE = False
+        with caplog.at_level(logging.WARNING, logger="mcp_server.telemetry"):
+            tel.init_telemetry()
+
+    assert any("traceloop-sdk is not installed" in r.message for r in caplog.records)
+
+
+@pytest.mark.unit
+def test_init_instruments_mcp_and_calls_traceloop_init(monkeypatch):
+    # Validates: when endpoint is set, McpInstrumentor().instrument() and Traceloop.init() are both called.
+    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")
+    monkeypatch.setenv("OTEL_SERVICE_NAME", "test-svc")
+
+    mock_traceloop_class = MagicMock()
+    mock_instrumentor_class = MagicMock()
+    mock_instrumentor_instance = MagicMock()
+    mock_instrumentor_class.return_value = mock_instrumentor_instance
+
+    mock_otel_mcp_mod = MagicMock()
+    mock_otel_mcp_mod.McpInstrumentor = mock_instrumentor_class
+
+    mock_traceloop_mod = MagicMock()
+    mock_traceloop_mod.Traceloop = mock_traceloop_class
+
+    with _reset_telemetry_module():
+        with patch.dict("sys.modules", {
+            "traceloop": MagicMock(),
+            "traceloop.sdk": mock_traceloop_mod,
+            "traceloop.sdk.decorators": MagicMock(),
+            "opentelemetry.instrumentation.mcp": mock_otel_mcp_mod,
+        }):
+            import mcp_server.telemetry as tel
+            tel._SDK_AVAILABLE = True
+            result = tel.init_telemetry()
+
+    assert result is True
+    mock_instrumentor_instance.instrument.assert_called_once()
+    mock_traceloop_class.init.assert_called_once()
+    _, kwargs = mock_traceloop_class.init.call_args
+    assert kwargs["app_name"] == "test-svc"
+    assert kwargs["api_endpoint"] == "http://localhost:4318"
+
+
+@pytest.mark.unit
+def test_init_idempotent(monkeypatch):
+    # Validates: calling init_telemetry twice only initializes once.
+    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")
+
+    mock_traceloop_class = MagicMock()
+    mock_instrumentor_class = MagicMock()
+    mock_instrumentor_instance = MagicMock()
+    mock_instrumentor_class.return_value = mock_instrumentor_instance
+
+    mock_otel_mcp_mod = MagicMock()
+    mock_otel_mcp_mod.McpInstrumentor = mock_instrumentor_class
+
+    mock_traceloop_mod = MagicMock()
+    mock_traceloop_mod.Traceloop = mock_traceloop_class
+
+    with _reset_telemetry_module():
+        with patch.dict("sys.modules", {
+            "traceloop": MagicMock(),
+            "traceloop.sdk": mock_traceloop_mod,
+            "traceloop.sdk.decorators": MagicMock(),
+            "opentelemetry.instrumentation.mcp": mock_otel_mcp_mod,
+        }):
+            import mcp_server.telemetry as tel
+            tel._SDK_AVAILABLE = True
+            r1 = tel.init_telemetry()
+            r2 = tel.init_telemetry()
+
+    assert mock_traceloop_class.init.call_count == 1
+    assert r1 is True
+    assert r2 is True
+
+
+@pytest.mark.unit
+def test_init_idempotent_returns_false_when_disabled(monkeypatch):
+    # Validates: second call returns False (not True) when first call disabled telemetry.
+    monkeypatch.delenv("TRACELOOP_BASE_URL", raising=False)
+
+    with _reset_telemetry_module():
+        import mcp_server.telemetry as tel
+        tel._SDK_AVAILABLE = True
+        r1 = tel.init_telemetry()
+        r2 = tel.init_telemetry()
+
+    assert r1 is False
+    assert r2 is False
+
+
+@pytest.mark.unit
+def test_init_disable_batch_flag(monkeypatch):
+    # Validates: OTEL_EXPORT_BATCH=false sets disable_batch=True in Traceloop.init.
+    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")
+    monkeypatch.setenv("OTEL_EXPORT_BATCH", "false")
+
+    mock_traceloop_class = MagicMock()
+    mock_traceloop_mod = MagicMock()
+    mock_traceloop_mod.Traceloop = mock_traceloop_class
+    mock_otel_mcp_mod = MagicMock()
+    mock_otel_mcp_mod.McpInstrumentor.return_value = MagicMock()
+
+    with _reset_telemetry_module():
+        with patch.dict("sys.modules", {
+            "traceloop": MagicMock(),
+            "traceloop.sdk": mock_traceloop_mod,
+            "traceloop.sdk.decorators": MagicMock(),
+            "opentelemetry.instrumentation.mcp": mock_otel_mcp_mod,
+        }):
+            import mcp_server.telemetry as tel
+            tel._SDK_AVAILABLE = True
+            tel.init_telemetry()
+
+    _, kwargs = mock_traceloop_class.init.call_args
+    assert kwargs["disable_batch"] is True
+
+
+@pytest.mark.unit
+def test_init_restores_stdout_on_exception(monkeypatch):
+    # Validates: sys.stdout is restored even when Traceloop.init() raises.
+    monkeypatch.setenv("TRACELOOP_BASE_URL", "http://localhost:4318")
+
+    mock_traceloop_class = MagicMock()
+    mock_traceloop_class.init.side_effect = RuntimeError("init boom")
+    mock_traceloop_mod = MagicMock()
+    mock_traceloop_mod.Traceloop = mock_traceloop_class
+    mock_otel_mcp_mod = MagicMock()
+    mock_otel_mcp_mod.McpInstrumentor.return_value = MagicMock()
+
+    original_stdout = sys.stdout
+
+    with _reset_telemetry_module():
+        with patch.dict("sys.modules", {
+            "traceloop": MagicMock(),
+            "traceloop.sdk": mock_traceloop_mod,
+            "traceloop.sdk.decorators": MagicMock(),
+            "opentelemetry.instrumentation.mcp": mock_otel_mcp_mod,
+        }):
+            import mcp_server.telemetry as tel
+            tel._SDK_AVAILABLE = True
+            result = tel.init_telemetry()
+
+    assert result is False
+    assert sys.stdout is original_stdout
+
+
+# ---------------------------------------------------------------------------
+# _truncate tests
+# ---------------------------------------------------------------------------
+
+@pytest.mark.unit
+def test_truncate_short_value():
+    # Validates: short values are JSON-encoded (a str gains surrounding quotes) but are not cut or given an ellipsis.
+    from mcp_server.telemetry import _MAX_ATTR_LEN, _truncate
+    assert _truncate("hello") == '"hello"'
+    assert len(_truncate("hello")) < _MAX_ATTR_LEN
+
+
+@pytest.mark.unit
+def test_truncate_long_value():
+    # Validates: a 2000-character input is cut to about _MAX_ATTR_LEN and ends in "...".
+    from mcp_server.telemetry import _MAX_ATTR_LEN, _truncate
+    shortened = _truncate("x" * 2000)
+    size_cap = _MAX_ATTR_LEN + 10  # small slack for the suffix
+    assert len(shortened) <= size_cap
+    assert shortened.endswith("...")
+
+
+# ---------------------------------------------------------------------------
+# trace_operation tests
+# ---------------------------------------------------------------------------
+
+@pytest.mark.unit
+def test_trace_operation_noop_when_no_provider():
+    # Validates: without a configured provider, trace_operation still runs its
+    # body and yields a span object whose set_attribute() can be called safely.
+    from mcp_server.telemetry import trace_operation
+    body_ran = False
+    with trace_operation("test_op") as active_span:
+        body_ran = True
+        active_span.set_attribute("key", "value")  # must not raise
+    assert body_ran
+
+
+@pytest.mark.unit
+def test_trace_operation_noop_span_on_import_error():
+    # Validates: when opentelemetry cannot be imported, trace_operation yields a
+    # _NoopSpan rather than raising ImportError.
+    # Regression: production environments without the dev extras
+    # (opentelemetry-api) must still be able to call trace_operation().
+    from mcp_server.telemetry import _NoopSpan, trace_operation
+
+    # A None entry in sys.modules makes importing that name raise ImportError.
+    hidden_modules = {"opentelemetry": None, "opentelemetry.trace": None}
+    body_ran = False
+    with patch.dict("sys.modules", hidden_modules):
+        with trace_operation("test_noop") as fallback_span:
+            body_ran = True
+            assert isinstance(fallback_span, _NoopSpan)
+            # The no-op span has to accept each of these calls without raising.
+            fallback_span.set_attribute("key", "value")
+            fallback_span.set_status("ok")
+            fallback_span.record_exception(None)
+        assert body_ran
+
+
+@pytest.mark.unit
+def test_trace_operation_child_span():
+    # Validates: trace_operation exports exactly one span, with the given name and attributes, via the patched tracer.
+    from opentelemetry import trace as otel_trace
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+
+    from mcp_server.telemetry import trace_operation
+
+    captured = InMemorySpanExporter()
+    sdk_provider = TracerProvider()
+    sdk_provider.add_span_processor(SimpleSpanProcessor(captured))
+    use_sdk_tracer = patch.object(otel_trace, "get_tracer", return_value=sdk_provider.get_tracer("t"))
+    with use_sdk_tracer, trace_operation("my_op", {"key": "val"}):
+        pass
+
+    finished = captured.get_finished_spans()
+    assert len(finished) == 1
+    assert finished[0].name == "my_op"
+    assert finished[0].attributes.get("key") == '"val"'
+
+
+@pytest.mark.unit
+def test_trace_operation_records_exception():
+    # Validates: an exception raised inside trace_operation propagates and the exported span has ERROR status.
+    from opentelemetry import trace as otel_trace
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+    from opentelemetry.trace import StatusCode
+
+    exporter = InMemorySpanExporter()
+    provider = TracerProvider()
+    provider.add_span_processor(SimpleSpanProcessor(exporter))
+
+    from mcp_server.telemetry import trace_operation
+    with patch.object(otel_trace, "get_tracer", return_value=provider.get_tracer("t")):
+        with pytest.raises(ValueError), trace_operation("failing_op"):
+            raise ValueError("boom")
+
+    spans = exporter.get_finished_spans()
+    assert len(spans) == 1  # fail as an assertion, not an IndexError, when nothing was exported
+    assert spans[0].status.status_code == StatusCode.ERROR
+
+
+# ---------------------------------------------------------------------------
+# traced() decorator tests
+# ---------------------------------------------------------------------------
+
+@pytest.mark.unit
+def test_traced_noop_when_telemetry_disabled():
+    # Validates: with _TELEMETRY_ENABLED False (SDK installed but no endpoint configured),
+    # a traced() function is invoked exactly once and its return value is passed through.
+    invocations: list[int] = []
+
+    def my_fn(x: int) -> dict:
+        invocations.append(x)
+        return {"ok": True, "value": x}
+
+    with _reset_telemetry_module():
+        import mcp_server.telemetry as tel
+
+        tel._TELEMETRY_ENABLED = False
+        outcome = tel.traced()(my_fn)(42)
+
+    assert outcome == {"ok": True, "value": 42}
+    assert invocations == [42]
+
+
+@pytest.mark.unit
+def test_traced_creates_span_when_enabled():
+    # Validates: with _TELEMETRY_ENABLED True, calling a traced(op_name=...) function
+    # returns the function's result and exports one span named after op_name.
+    from opentelemetry import trace as otel_trace
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+
+    captured = InMemorySpanExporter()
+    sdk_provider = TracerProvider()
+    sdk_provider.add_span_processor(SimpleSpanProcessor(captured))
+
+    def my_fn() -> dict:
+        return {"ok": True}
+
+    with _reset_telemetry_module():
+        import mcp_server.telemetry as tel
+        tel._TELEMETRY_ENABLED = True
+        traced_fn = tel.traced(op_name="custom_name")(my_fn)
+        with patch.object(otel_trace, "get_tracer", return_value=sdk_provider.get_tracer("t")):
+            outcome = traced_fn()
+
+    assert outcome == {"ok": True}
+    finished = captured.get_finished_spans()
+    assert len(finished) == 1
+    assert finished[0].name == "custom_name"
+
+
+@pytest.mark.unit
+def test_traced_marks_error_on_ok_false():
+    # Validates: when the wrapped function returns a dict with ok=False, traced() hands the
+    # dict back untouched and the span gets ERROR status plus an "error.message" attribute.
+    from opentelemetry import trace as otel_trace
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+    from opentelemetry.trace import StatusCode
+
+    captured = InMemorySpanExporter()
+    sdk_provider = TracerProvider()
+    sdk_provider.add_span_processor(SimpleSpanProcessor(captured))
+
+    def failing_op() -> dict:
+        return {"ok": False, "error": "something failed"}
+
+    with _reset_telemetry_module():
+        import mcp_server.telemetry as tel
+        tel._TELEMETRY_ENABLED = True
+        traced_op = tel.traced()(failing_op)
+        with patch.object(otel_trace, "get_tracer", return_value=sdk_provider.get_tracer("t")):
+            outcome = traced_op()
+
+    assert outcome == {"ok": False, "error": "something failed"}
+    finished = captured.get_finished_spans()
+    assert len(finished) == 1
+    assert finished[0].status.status_code == StatusCode.ERROR
+    assert finished[0].attributes.get("error.message") == "something failed"
+
+
+@pytest.mark.unit
+def test_traced_uses_function_name_as_default_span_name():
+    # Validates: when traced() is given no op_name, the exported span is named
+    # after the decorated function.
+    from opentelemetry import trace as otel_trace
+    from opentelemetry.sdk.trace import TracerProvider
+    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+
+    captured = InMemorySpanExporter()
+    sdk_provider = TracerProvider()
+    sdk_provider.add_span_processor(SimpleSpanProcessor(captured))
+
+    def run_simulation(osm_path: str) -> dict:
+        return {"ok": True}
+
+    with _reset_telemetry_module():
+        import mcp_server.telemetry as tel
+        tel._TELEMETRY_ENABLED = True
+        traced_sim = tel.traced()(run_simulation)
+        with patch.object(otel_trace, "get_tracer", return_value=sdk_provider.get_tracer("t")):
+            traced_sim("/tmp/test.osm")
+
+    finished = captured.get_finished_spans()
+    assert len(finished) == 1
+    assert finished[0].name == "run_simulation"
+
+
+# ---------------------------------------------------------------------------
+# Startup wiring regression — init_telemetry() must appear before FastMCP
+# is instantiated in server.main(). Checked via AST so this runs without
+# importing server.py (which requires /runs to exist and openstudio).
+# ---------------------------------------------------------------------------
+
+@pytest.mark.unit
+def test_server_main_calls_init_telemetry_before_fastmcp():
+    # Regression: init_telemetry() must be called before FastMCP is
+    # instantiated inside server.main(). If the order is reversed,
+    # McpInstrumentor cannot patch FastMCP.__init__ and all auto-
+    # instrumentation silently stops working.
+    # The earliest call of each kind is compared. A call is recognised by its
+    # name whether it is a bare statement, sits inside an assignment or
+    # condition, or is reached through an attribute (telemetry.init_telemetry()).
+    import ast
+    from pathlib import Path
+
+    server_py = Path(__file__).parent.parent / "mcp_server" / "server.py"
+    tree = ast.parse(server_py.read_text(encoding="utf-8"))
+
+    main_fn = next(
+        (n for n in ast.walk(tree)
+         if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) and n.name == "main"),
+        None,
+    )
+    assert main_fn is not None, "main() not found in mcp_server/server.py"
+
+    def first_call_line(callee):
+        # Lowest line number in main() of a call to `callee` / `<obj>.callee`, else None.
+        hits = []
+        for node in ast.walk(main_fn):  # walk order is not source order, hence min() below
+            if not isinstance(node, ast.Call):
+                continue
+            func = node.func
+            named = isinstance(func, ast.Name) and func.id == callee
+            dotted = isinstance(func, ast.Attribute) and func.attr == callee
+            if named or dotted:
+                hits.append(node.lineno)
+        return min(hits, default=None)
+
+    init_telemetry_line = first_call_line("init_telemetry")
+    fastmcp_line = first_call_line("FastMCP")
+
+    assert init_telemetry_line is not None, (
+        "init_telemetry() call not found in server.main(). "
+        "It must be called before FastMCP is instantiated."
+    )
+    assert fastmcp_line is not None, "FastMCP() instantiation not found in server.main()."
+    assert init_telemetry_line < fastmcp_line, (
+        f"init_telemetry() (line {init_telemetry_line}) must come BEFORE "
+        f"FastMCP() (line {fastmcp_line}) in server.main(). "
+        "McpInstrumentor patches FastMCP.__init__ during init_telemetry(); "
+        "reversing the order silently disables all auto-instrumentation."
+    )
+
+
+# ---------------------------------------------------------------------------
+# Decorator coverage regression — @traced() must remain on all promised ops.
+# Checked via AST to avoid importing skill modules that require openstudio.
+# ---------------------------------------------------------------------------
+
+@pytest.mark.unit
+def test_traced_decorator_applied_to_promised_operations():
+    # Regression: the CHANGELOG and README both promise that specific operations
+    # emit spans. This test guards against accidental decorator removal by
+    # inspecting the source AST of each operations file.
+    import ast
+    from pathlib import Path
+
+    repo_root = Path(__file__).parent.parent
+    expected = [
+        ("mcp_server/skills/simulation/operations.py", "run_simulation"),
+        ("mcp_server/skills/measures/operations.py", "apply_measure"),
+        ("mcp_server/skills/measure_authoring/operations.py", "create_measure_op"),
+        ("mcp_server/skills/comstock/operations.py", "create_typical_building"),
+        ("mcp_server/skills/comstock/operations.py", "create_bar_building"),
+        ("mcp_server/skills/comstock/operations.py", "create_new_building"),
+        ("mcp_server/skills/common_measures/wrappers.py", "run_qaqc_checks_op"),
+    ]
+
+    missing = []
+    for rel_path, func_name in expected:
+        # Decode explicitly as UTF-8 so the check does not depend on the locale.
+        tree = ast.parse((repo_root / rel_path).read_text(encoding="utf-8"))
+        definition = next(
+            (n for n in ast.walk(tree)
+             if isinstance(n, (ast.FunctionDef, ast.AsyncFunctionDef)) and n.name == func_name),
+            None,
+        )
+        if definition is None:
+            missing.append(f"{rel_path}::{func_name} (function not found)")
+            continue
+        # Only the call form counts: @traced(...) (ast.Name) or @<module>.traced(...) (ast.Attribute).
+        called = [d.func for d in definition.decorator_list if isinstance(d, ast.Call)]
+        if not any(getattr(f, "id", None) == "traced" or getattr(f, "attr", None) == "traced" for f in called):
+            missing.append(f"{rel_path}::{func_name}")
+
+    assert not missing, (
+        f"The following operations are missing @traced() decoration: {missing}. "
+        "Every operation listed in the CHANGELOG/README tracing section must be "
+        "wrapped with @traced() so it emits a span when telemetry is enabled."
+    )