diff --git a/benchmark/CLAUDE.threefile.md b/benchmark/CLAUDE.threefile.md
new file mode 100644
index 0000000..5d010ff
--- /dev/null
+++ b/benchmark/CLAUDE.threefile.md
@@ -0,0 +1,25 @@
+# Django Source — supermodel three-file shards enabled
+
+This is the Django framework source. The auth package is at `django/contrib/auth/`.
+
+## Graph shard files
+
+`supermodel analyze --three-file` has run on this repo. Every source file has
+three shard files with pre-computed context:
+
+- `.calls.py` — function call relationships (who calls what, with file and line number)
+- `.deps.py` — import dependencies (what this file imports and what imports it)
+- `.impact.py` — blast radius (risk level, affected domains, direct/transitive dependents)
+
+**Read the shard files before the source file.** They show you the full
+picture in far fewer tokens. For example:
+
+- Wondering what `django/contrib/auth/__init__.py` calls and who calls it?
+  → read `django/contrib/auth/__init__.calls.py`
+- Need to know what this module depends on?
+  → read `django/contrib/auth/__init__.deps.py`
+- Want to assess blast radius before changing something?
+  → read `django/contrib/auth/__init__.impact.py`
+
+Use the shard files to navigate efficiently. Only drop into the source when you
+need implementation details the shards don't cover.
diff --git a/benchmark/Dockerfile.threefile b/benchmark/Dockerfile.threefile
new file mode 100644
index 0000000..dfa938c
--- /dev/null
+++ b/benchmark/Dockerfile.threefile
@@ -0,0 +1,52 @@
+# Benchmark container: Claude Code + supermodel --three-file on django/django
+# Build from repo root: docker build -f benchmark/Dockerfile.threefile -t bench-threefile .
+
+# Stage 1: Build supermodel binary
+FROM golang:alpine AS supermodel-builder
+# The golang image defaults to GOTOOLCHAIN=local; auto lets the go command
+# download a newer toolchain when go.mod requires one.
+ENV GOTOOLCHAIN=auto
+WORKDIR /build
+COPY . .
+RUN go build \
+    -ldflags="-s -w -X github.com/supermodeltools/cli/internal/build.Version=benchmark" \
+    -o /build/supermodel \
+    .
+
+# Stage 2: Runtime
+FROM python:3.12-slim
+
+# Fail a RUN when any stage of a pipeline fails. Without pipefail a failed
+# `curl` below leaves `bash -` reading empty input and exiting 0, so the build
+# would carry on and install whatever nodejs the distro archive offers.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# System deps + Node.js 20
+RUN apt-get update && apt-get install -y curl ca-certificates git && \
+    curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
+    apt-get install -y nodejs && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Claude Code + supermodel
+RUN npm install -g @anthropic-ai/claude-code
+COPY --from=supermodel-builder /build/supermodel /usr/local/bin/supermodel
+
+# Clone Django source at a fixed tag
+RUN git clone --depth=1 --branch 5.0.6 \
+    https://github.com/django/django.git /app
+
+# Install Django in editable mode
+RUN pip install --no-cache-dir -e /app
+
+# Drop in the change_tracking test app
+COPY benchmark/change_tracking/ /app/tests/change_tracking/
+
+# Copy task + CLAUDE.md
+COPY benchmark/task.md /benchmark/task.md
+COPY benchmark/CLAUDE.threefile.md /app/CLAUDE.md
+
+# Non-root user
+RUN useradd -m -s /bin/bash bench && chown -R bench:bench /app /benchmark
+USER bench
+
+COPY benchmark/entrypoint.threefile.sh /entrypoint.sh
+ENTRYPOINT ["/bin/bash", "/entrypoint.sh"]
diff --git a/benchmark/entrypoint.threefile.sh b/benchmark/entrypoint.threefile.sh
new file mode 100755
index 0000000..9e7d99d
--- /dev/null
+++ b/benchmark/entrypoint.threefile.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Benchmark driver: baseline test run, supermodel shard generation, hook
+# wiring, Claude Code run, final test run, and cost summary.
+set -euo pipefail
+
+RUN_TESTS="python tests/runtests.py --settings=test_sqlite change_tracking"
+
+echo "============================================================"
+echo "BENCHMARK: Claude Code + supermodel --three-file — django/django"
+echo "============================================================"
+echo
+
+echo "--- Initial test run (all 8 should FAIL/ERROR) ---"
+cd /app
+# Failure is the expected baseline; `|| true` keeps set -e from aborting.
+PYTHONPATH=tests $RUN_TESTS -v 0 2>&1 | tail -3 || true
+echo
+
+echo "--- Running supermodel analyze --three-file ---"
+supermodel analyze --three-file /app 2>&1 | tee /tmp/supermodel_analyze.txt
+echo
+
+echo "--- Wiring supermodel hook ---"
+mkdir -p ~/.claude
+cat > ~/.claude/settings.json <<'JSON'
+{
+  "hooks": {
+    "PostToolUse": [
+      {
+        "matcher": "Write|Edit",
+        "hooks": [{ "type": "command", "command": "supermodel hook" }]
+      }
+    ]
+  }
+}
+JSON
+
+echo "--- Running Claude Code on task ---"
+cd /app
+claude \
+  --print "$(cat /benchmark/task.md)" \
+  --dangerously-skip-permissions \
+  --output-format stream-json \
+  --verbose \
+  2>&1 | tee /tmp/claude_raw.txt
+
+echo
+echo "============================================================"
+echo "TEST RESULTS"
+echo "============================================================"
+# Record the status rather than letting `set -e` + pipefail abort here: a
+# failing test run must still reach the cost summary below.
+test_status=0
+PYTHONPATH=tests $RUN_TESTS -v 2 2>&1 | tee /tmp/test_results.txt || test_status=$?
+
+echo
+echo "============================================================"
+echo "COST SUMMARY"
+echo "============================================================"
+grep '"costUSD"\|"total_cost_usd"' /tmp/claude_raw.txt 2>/dev/null | tail -3 || echo "(check log)"
+
+# Exit with the final test run's status so a failing run still exits non-zero.
+exit "$test_status"
diff --git a/benchmark/run.sh b/benchmark/run.sh
index a817f6e..eb4ca7e 100755
--- a/benchmark/run.sh
+++ b/benchmark/run.sh
@@ -38,6 +38,13 @@ docker build \
   "$REPO_ROOT" \
   2>&1 | tail -3
 
+echo "==> Building bench-threefile (three-file shard format)..."
+docker build \
+  -f "$SCRIPT_DIR/Dockerfile.threefile" \
+  -t bench-threefile \
+  "$REPO_ROOT" \
+  2>&1 | tail -3
+
 echo
 # ── Run containers ────────────────────────────────────────────────────────────
 
@@ -56,6 +63,14 @@ docker run --rm \
   bench-supermodel \
   2>&1 | tee "$RESULTS_DIR/supermodel.txt"
 
+echo
+echo "==> Running three-file container..."
+docker run --rm \
+  -e ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" \
+  -e SUPERMODEL_API_KEY="$SUPERMODEL_API_KEY" \
+  bench-threefile \
+  2>&1 | tee "$RESULTS_DIR/threefile.txt"
+
 echo
 echo "==> Comparing results..."
-"$SCRIPT_DIR/compare.sh" "$RESULTS_DIR/naked.txt" "$RESULTS_DIR/supermodel.txt"
+"$SCRIPT_DIR/compare.sh" "$RESULTS_DIR/naked.txt" "$RESULTS_DIR/supermodel.txt" "$RESULTS_DIR/threefile.txt"