diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ad0cb00..69166a4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -180,6 +180,36 @@ jobs:
fi
echo "E2E tests passed!"
+ benchmark-smoke:
+ name: Benchmark Suite Smoke Test
+ runs-on: ubuntu-24.04
+ continue-on-error: true
+ needs: [test]
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Install system dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y protobuf-compiler libclang-dev
+
+ - name: Install Rust
+ uses: dtolnay/rust-toolchain@stable
+
+ - name: Cache cargo registry
+ uses: Swatinem/rust-cache@v2
+ with:
+ shared-key: "bench-smoke"
+
+ - name: Build memory-bench
+ run: cargo build -p memory-bench
+
+ - name: Smoke test (help only — no daemon required)
+ run: |
+ cargo run -p memory-bench -- --help
+ cargo run -p memory-bench -- all --help
+ cargo run -p memory-bench -- locomo --help
+
# Summary job that depends on all other jobs
ci-success:
name: CI Success
diff --git a/.gitignore b/.gitignore
index f0cb711..967f271 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,3 +58,6 @@ coverage/
# Local Cargo configuration (platform-specific)
.cargo/
+
+# LOCOMO benchmark dataset — download separately via benchmarks/scripts/download-locomo.sh
+locomo-data/
diff --git a/.planning/REQUIREMENTS.md b/.planning/REQUIREMENTS.md
index adf2dfb..f64a156 100644
--- a/.planning/REQUIREMENTS.md
+++ b/.planning/REQUIREMENTS.md
@@ -33,14 +33,14 @@ Requirements for the Competitive Parity & Benchmarks milestone. Each maps to roa
### Benchmark Suite (BENCH)
-- [ ] **BENCH-01**: Custom benchmark harness with TOML fixture files (temporal, multisession, compression)
-- [ ] **BENCH-02**: `memory benchmark temporal|multisession|compression|all` subcommands
-- [ ] **BENCH-03**: Benchmark reports accuracy, recall@5, token_usage, latency_p50/p95, compression ratio
-- [ ] **BENCH-04**: LOCOMO adapter ingests Snap Research dataset and produces `results.json` with aggregate score
-- [ ] **BENCH-05**: `--compare` flag reads `benchmarks/baselines.toml` and prints side-by-side competitor table
-- [ ] **BENCH-06**: `locomo-data/` in `.gitignore` — dataset never committed
-- [ ] **BENCH-07**: CI runs benchmark suite (non-blocking, skips LOCOMO without `--dataset` flag)
-- [ ] **BENCH-08**: JSON + markdown report output for all benchmark types
+- [x] **BENCH-01**: Custom benchmark harness with TOML fixture files (temporal, multisession, compression)
+- [x] **BENCH-02**: `memory benchmark temporal|multisession|compression|all` subcommands
+- [x] **BENCH-03**: Benchmark reports accuracy, recall@5, token_usage, latency_p50/p95, compression ratio
+- [x] **BENCH-04**: LOCOMO adapter ingests Snap Research dataset and produces `results.json` with aggregate score
+- [x] **BENCH-05**: `--compare` flag reads `benchmarks/baselines.toml` and prints side-by-side competitor table
+- [x] **BENCH-06**: `locomo-data/` in `.gitignore` — dataset never committed
+- [x] **BENCH-07**: CI runs benchmark suite (non-blocking, skips LOCOMO without `--dataset` flag)
+- [x] **BENCH-08**: JSON + markdown report output for all benchmark types
## Future Requirements (v3.1+)
@@ -81,14 +81,14 @@ Requirements for the Competitive Parity & Benchmarks milestone. Each maps to roa
| CLI-08 | Phase 52 | Complete |
| CLI-09 | Phase 52 | Complete |
| CLI-10 | Phase 52 | Complete |
-| BENCH-01 | Phase 53 | Pending |
-| BENCH-02 | Phase 53 | Pending |
-| BENCH-03 | Phase 53 | Pending |
-| BENCH-04 | Phase 53 | Pending |
-| BENCH-05 | Phase 53 | Pending |
-| BENCH-06 | Phase 53 | Pending |
-| BENCH-07 | Phase 53 | Pending |
-| BENCH-08 | Phase 53 | Pending |
+| BENCH-01 | Phase 53 | Complete |
+| BENCH-02 | Phase 53 | Complete |
+| BENCH-03 | Phase 53 | Complete |
+| BENCH-04 | Phase 53 | Complete |
+| BENCH-05 | Phase 53 | Complete |
+| BENCH-06 | Phase 53 | Complete |
+| BENCH-07 | Phase 53 | Complete |
+| BENCH-08 | Phase 53 | Complete |
**Coverage:**
- v3.0 requirements: 26 total
diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md
index ecb2eb1..12e0bd7 100644
--- a/.planning/ROADMAP.md
+++ b/.planning/ROADMAP.md
@@ -143,8 +143,9 @@ See: `.planning/milestones/v2.7-ROADMAP.md`
- [x] **Phase 51: Retrieval Orchestrator** - Query expansion, RRF fusion, LLM reranking, and context building as a new crate wrapping RetrievalExecutor (merged 2026-04-28 via PR #28)
- [x] **Phase 51.5: API Summarizer Wiring** - Wire `ApiSummarizer` from config (out-of-band; merged 2026-04-28 via PR #27)
-- [x] **Phase 52: Simple CLI API** - New `memory` binary with search, context, recall, add, timeline, summary subcommands (PR in review 2026-05-12)
-- [ ] **Phase 53: Benchmark Suite** - Custom TOML-fixture harness with LOCOMO adapter and publishable scoring
+- [x] **Phase 52: Simple CLI API** - New `memory` binary with search, context, recall, add, timeline, summary subcommands (merged 2026-05-14 via PR #29)
+- [x] **Phase 53.5: Cross-Project Federation** - Federated query across multiple project stores (out-of-band; merged 2026-05-14 via PR #25)
+- [x] **Phase 53: Benchmark Suite** - Custom TOML-fixture harness with LOCOMO adapter and publishable scoring (PR in review 2026-05-14)
## Phase Details
@@ -202,11 +203,12 @@ Plans:
3. Running `memory benchmark --compare` reads `benchmarks/baselines.toml` and prints a side-by-side competitor comparison table
4. Benchmark output is available in both JSON and Markdown report formats
5. CI runs the benchmark suite without blocking (LOCOMO skipped when `--dataset` flag is absent); `locomo-data/` is gitignored
-**Plans**: TBD
+**Plans**: 3 plans
Plans:
-- [ ] 53-01: TBD
-- [ ] 53-02: TBD
+- [x] 53-01-PLAN.md — Scaffold crate, fixture format, TOML loader, and benchmark data files
+- [x] 53-02-PLAN.md — Runner, scorer, report, baseline comparison, and CLI wiring
+- [x] 53-03-PLAN.md — LOCOMO adapter and full QA verification
## Progress
@@ -224,13 +226,13 @@ Phases execute in numeric order: 51 -> 51.5 (merged out-of-band) -> 52 -> 53
| v2.5 Semantic Dedup | 35-38 | 11/11 | Complete | 2026-03-10 |
| v2.6 Cognitive Retrieval | 39-44 | 13/13 | Complete | 2026-03-16 |
| v2.7 Multi-Runtime Portability | 45-50 | 11/11 | Complete | 2026-03-22 |
-| v3.0 Competitive Parity | 51-53 + 51.5, 53.5 | 5/TBD | In progress | Phase 51 + 51.5 + 52 merged; Phase 53.5 (cross-project) in PR review |
+| v3.0 Competitive Parity | 51-53 + 51.5, 53.5 | 6/TBD | In progress | Phase 51 + 51.5 + 52 + 53.5 merged; Phase 53 (Benchmark Suite) in PR review |
---
## v3.0 Cross-Project Federation (out-of-band)
-> Branch: `feature/v3.0-cross-project-memory` (PR #25)
+> Merged via PR #25 (2026-05-14)
### Phase 53.5: Cross-Project Federation Core (1/1 plan) — COMPLETE 2026-04-10
@@ -249,4 +251,4 @@ Out-of-band insertion (mirrors Phase 51.5 pattern). Originally planned as Phase
---
-*Updated: 2026-05-14 — Phase 52 merged via PR #29; Phase 53.5 (cross-project federation) under review via PR #25*
+*Updated: 2026-05-14 — Phase 53 (Benchmark Suite) opening PR to close v3.0*
diff --git a/.planning/STATE.md b/.planning/STATE.md
index 5079a5b..f5d827f 100644
--- a/.planning/STATE.md
+++ b/.planning/STATE.md
@@ -4,13 +4,13 @@ milestone_name: Competitive Parity & Benchmarks
status: in_progress
stopped_at: null
last_updated: "2026-05-14T00:00:00.000Z"
-last_activity: 2026-05-14 — Phase 52 merged via PR #29; Phase 53.5 (cross-project) re-rebased for merge
+last_activity: 2026-05-14 — Phase 53 (Benchmark Suite) rebased onto main; opening PR closes v3.0
progress:
- total_phases: 4
- completed_phases: 3
- total_plans: 7
- completed_plans: 7
- percent: 75
+ total_phases: 5
+ completed_phases: 4
+ total_plans: 8
+ completed_plans: 8
+ percent: 100
---
# Project State
@@ -20,30 +20,35 @@ progress:
See: .planning/PROJECT.md (updated 2026-03-22)
**Core value:** Agent can answer "what were we talking about last week?" without scanning everything
-**Current focus:** v3.0 Phase 52 — Simple CLI API (PR review)
+**Current focus:** v3.0 Phase 53 — Benchmark Suite (PR review; closes v3.0)
## Current Position
-Phase: 53.5 of 53 (cross-project federation, out-of-band) — landing via PR #25
-Plan: 1 of 1 complete (53.5-01 cross-project federated query)
-Status: Phase 51 + 51.5 + 52 merged; Phase 53.5 (cross-project) merging next; Phase 53 (Benchmark Suite) still pending
-Last activity: 2026-05-14 — Phase 52 merged via PR #29; PR #25 re-rebased onto post-Phase-52 main
+Phase: 53 of 53 (Benchmark Suite) — opening PR
+Plan: 3 of 3 complete (53-01 foundation, 53-02 runner/scorer/CLI, 53-03 LOCOMO adapter)
+Status: Phase 51 + 51.5 + 52 + 53.5 merged; Phase 53 (Benchmark Suite) PR opens; v3.0 fully shipped on merge
+Last activity: 2026-05-14 — Rebased gsd/phase-53-benchmark-suite onto post-Phase-53.5 main; opening PR
-Progress: [████████░░] 75% (3 of 4 phases)
+Progress: [██████████] 100% of plans (8 of 8 plans complete; 4 of 5 phases merged, Phase 53 PR pending)
## Out-of-band Work
### Open PRs
-| PR | Branch | Status | Reviewed | Notes |
-|---|---|---|---|---|
-| #25 | `feature/v3.0-cross-project-memory` | Open, CI green | Not yet | Recorded as Phase 53.5 (decimal-phase pattern, mirrors 51.5); rebased onto main 2026-05-08 |
-| #27 | merged 2026-04-28 as `3a73582` | Merged | — | Recorded as Phase 51.5; supersedes closed PR #26 |
-| #28 | merged 2026-04-28 as `85f3303` | Merged | — | Phase 51 Retrieval Orchestrator |
+(none — Phase 53 PR opening shortly)
-### Local-only Branches (still stacked, pending PRs)
+### Recently Merged
-- `gsd/phase-{53..58}` — 6-phase stack of GSD work covering remaining v3.0 (Phase 53 Benchmark Suite), v3.1 (Phases 54-56), and v3.2 (Phases 57-58). Each branch backed up to origin 2026-05-12 (no PRs). Pending strategic decision: per-milestone PRs vs. per-phase. **Note:** the planning files on these branches describe v3.0/v3.1 as "shipped" — that reflects local execution intent, not origin/main reality.
+| PR | What | Merged |
+|---|---|---|
+| #25 | Phase 53.5: cross-project federated query | 2026-05-14 |
+| #29 | Phase 52: Simple CLI API | 2026-05-14 |
+| #28 | Phase 51: Retrieval Orchestrator | 2026-04-28 |
+| #27 | Phase 51.5: API summarizer wiring | 2026-04-28 |
+
+### Local-only Branches (still stacked)
+
+- `gsd/phase-{54..58}` — 5-phase stack of GSD work covering v3.1 (Phases 54-56: export/backup/import) and v3.2 (Phases 57-58: runtime registration). Each branch backed up to origin 2026-05-12 (no PRs). Pending strategic decision: per-milestone PRs vs. per-phase. **Note:** the planning files on these branches describe v3.0/v3.1 as "shipped" — that reflects local execution intent, not origin/main reality.
## Performance Metrics
@@ -77,6 +82,9 @@ See .planning/MILESTONES.md
- [Phase 53.5]: Project attribution stored in `metadata["project"]` — same convention as `metadata["agent"]` from v2.1
- [Phase 53.5]: `federated_query` is a pure function — matches existing `enrich_with_salience` pattern
- [Phase 53.5]: `open_read_only` uses `DB::open_cf_for_read_only` from rocksdb 0.22 with `create_if_missing(false)`
+- [Phase 53]: New `memory-bench` crate with TOML fixture loader, runner/scorer/report/baseline modules, and LOCOMO adapter
+- [Phase 53]: Benchmark dataset (LOCOMO) gitignored — adapter loads from local path; never committed
+- [Phase 53]: CI benchmark smoke test added to verify the harness runs (not the full LOCOMO score)
## Blockers
@@ -105,9 +113,9 @@ See: .planning/MILESTONES.md for complete history
## Cumulative Stats
-- ~58,400 LOC Rust across 16 crates (memory-orchestrator from Phase 51, memory-cli from Phase 52) + federated module in memory-service (Phase 53.5)
-- 52 phases (Phase 1-52 + 53.5), 154 plans across 9 milestones
-- 50+ E2E tests + 144 bats CLI tests + orchestrator + memory-cli + 9 federated unit tests + 4 cross-project e2e tests
+- ~60,000 LOC Rust across 17 crates (memory-orchestrator, memory-cli, memory-bench all new in v3.0)
+- 53 phases (Phase 1-53 + 51.5 + 53.5), 157 plans across 9 milestones
+- 50+ E2E tests + 144 bats CLI tests + orchestrator + memory-cli + memory-bench tests + 9 federated unit tests + 4 cross-project e2e tests + CI benchmark smoke test
## Session Continuity
diff --git a/.planning/phases/53-benchmark-suite/53-01-PLAN.md b/.planning/phases/53-benchmark-suite/53-01-PLAN.md
new file mode 100644
index 0000000..7ef421c
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-01-PLAN.md
@@ -0,0 +1,377 @@
+---
+phase: 53-benchmark-suite
+plan: 01
+type: execute
+wave: 1
+depends_on: []
+files_modified:
+ - Cargo.toml
+ - .gitignore
+ - crates/memory-bench/Cargo.toml
+ - crates/memory-bench/src/main.rs
+ - crates/memory-bench/src/lib.rs
+ - crates/memory-bench/src/fixture.rs
+ - benchmarks/baselines.toml
+ - benchmarks/scripts/download-locomo.sh
+ - benchmarks/fixtures/temporal-001.toml
+ - benchmarks/fixtures/multisession-001.toml
+ - benchmarks/fixtures/compression-001.toml
+ - benchmarks/fixtures/sessions/auth-decision.jsonl
+ - benchmarks/fixtures/sessions/bug-fix.jsonl
+ - benchmarks/fixtures/sessions/follow-up.jsonl
+ - benchmarks/fixtures/sessions/session-a.jsonl
+ - benchmarks/fixtures/sessions/session-b.jsonl
+ - benchmarks/fixtures/sessions/session-c.jsonl
+ - benchmarks/fixtures/sessions/long-session.jsonl
+autonomous: true
+requirements:
+ - BENCH-01
+ - BENCH-06
+
+must_haves:
+ truths:
+ - "memory-bench crate compiles as part of the workspace"
+ - "TOML fixture files parse into typed Rust structs"
+ - "Fixture loader validates required fields and rejects invalid fixtures"
+ - "locomo-data/ is gitignored"
+ - "Baseline competitor scores are stored in benchmarks/baselines.toml"
+ artifacts:
+ - path: "crates/memory-bench/Cargo.toml"
+ provides: "Crate manifest with workspace deps"
+ contains: "memory-bench"
+ - path: "crates/memory-bench/src/fixture.rs"
+ provides: "TOML fixture loader with validation"
+ exports: ["Fixture", "TestCase"]
+ - path: "benchmarks/fixtures/temporal-001.toml"
+ provides: "Temporal recall test fixtures"
+ contains: "[[test]]"
+ - path: "benchmarks/baselines.toml"
+ provides: "Competitor baseline scores"
+ contains: "[memmachine]"
+ key_links:
+ - from: "crates/memory-bench/src/fixture.rs"
+ to: "benchmarks/fixtures/*.toml"
+ via: "Fixture::load and Fixture::load_dir"
+ pattern: "toml::from_str"
+ - from: "Cargo.toml"
+ to: "crates/memory-bench"
+ via: "workspace members"
+ pattern: "crates/memory-bench"
+---
+
+
+Scaffold the memory-bench crate, define the TOML fixture format with loader, and create all benchmark data files.
+
+Purpose: Establish the crate structure and fixture format that all subsequent benchmark tasks build on. The fixture loader is the foundation — runner, scorer, and CLI all consume its types.
+Output: Compilable memory-bench crate with fixture loader, 3 fixture files, session stubs, baselines, and download script.
+
+
+
+@/Users/richardhightower/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/richardhightower/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md
+
+
+
+
+
+From Cargo.toml (workspace root):
+```toml
+[workspace]
+resolver = "2"
+members = [
+ "crates/e2e-tests",
+ # ... other crates ...
+ "crates/memory-cli",
+]
+
+[workspace.package]
+version = "2.7.0"
+edition = "2021"
+license = "MIT"
+```
+
+
+
+
+
+
+ Task 1: Scaffold memory-bench crate and benchmark data files
+
+ Cargo.toml,
+ .gitignore,
+ crates/memory-bench/Cargo.toml,
+ crates/memory-bench/src/main.rs,
+ crates/memory-bench/src/lib.rs,
+ benchmarks/baselines.toml,
+ benchmarks/scripts/download-locomo.sh,
+ benchmarks/fixtures/sessions/auth-decision.jsonl,
+ benchmarks/fixtures/sessions/bug-fix.jsonl,
+ benchmarks/fixtures/sessions/follow-up.jsonl,
+ benchmarks/fixtures/sessions/session-a.jsonl,
+ benchmarks/fixtures/sessions/session-b.jsonl,
+ benchmarks/fixtures/sessions/session-c.jsonl,
+ benchmarks/fixtures/sessions/long-session.jsonl
+
+
+ Cargo.toml (workspace root — to see members array and workspace.dependencies),
+ .gitignore (to append locomo-data/ entry),
+ docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md (Task 1 section)
+
+
+1. Create `crates/memory-bench/Cargo.toml`:
+```toml
+[package]
+name = "memory-bench"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "memory-bench"
+path = "src/main.rs"
+
+[dependencies]
+clap = { workspace = true }
+tokio = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+toml = { workspace = true }
+thiserror = { workspace = true }
+anyhow = { workspace = true }
+tracing = { workspace = true }
+
+[dev-dependencies]
+tempfile = { workspace = true }
+```
+
+2. Create `crates/memory-bench/src/lib.rs` — declare modules:
+```rust
+pub mod fixture;
+```
+(More modules added by Plan 02 and 03)
+
+3. Create `crates/memory-bench/src/main.rs` — minimal binary that compiles:
+```rust
+fn main() {
+ println!("memory-bench: use --help for subcommands");
+}
+```
+
+4. Add `"crates/memory-bench"` to workspace `members` in root `Cargo.toml`
+
+5. Append to `.gitignore`:
+```
+# LOCOMO benchmark dataset — download separately via benchmarks/scripts/download-locomo.sh
+locomo-data/
+```
+
+6. Create `benchmarks/baselines.toml` with manually-entered competitor scores:
+```toml
+# Manually-maintained competitor benchmark scores.
+# Sources listed per entry.
+
+[memmachine]
+# Source: https://memmachine.ai/blog/2025/12/memmachine-v0.2-delivers-top-scores-and-efficiency-on-locomo-benchmark/
+locomo_score = 0.91
+token_reduction = 0.80
+latency_improvement = 0.75
+
+[mem0]
+# Source: https://mem0.ai/research
+accuracy_vs_openai_memory = 0.26
+token_reduction = 0.90
+latency_reduction = 0.91
+```
+
+7. Create `benchmarks/scripts/download-locomo.sh` (chmod +x):
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+DEST="${1:-locomo-data}"
+mkdir -p "$DEST"
+echo "Downloading LOCOMO dataset to $DEST ..."
+curl -L "https://snap-research.github.io/locomo/data/locomo_v1.zip" -o "$DEST/locomo_v1.zip"
+unzip -q "$DEST/locomo_v1.zip" -d "$DEST"
+echo "Done. Dataset at: $DEST"
+echo "NOTE: Verify license terms at https://snap-research.github.io/locomo/ before publishing scores."
+```
+
+8. Create stub JSONL session files in `benchmarks/fixtures/sessions/`:
+- `auth-decision.jsonl` — 4-6 turns about choosing JWT auth
+- `bug-fix.jsonl` — 4-6 turns about fixing a null pointer bug with Option
+- `follow-up.jsonl` — 2-3 turns following up on the bug fix
+- `session-a.jsonl` — 4 turns deciding on an approach
+- `session-b.jsonl` — 4 turns with outcome discussion (performance/latency)
+- `session-c.jsonl` — 2 turns connecting decisions to outcomes
+- `long-session.jsonl` — 15-20 turns covering multiple architecture decisions (for compression tests)
+
+Each line is `{"role":"user"|"assistant","content":"..."}` format.
+
+9. Verify: `cargo build -p memory-bench` compiles successfully.
+
+
+ - Test: workspace builds with memory-bench included
+ - Test: binary runs without error
+
+
+ cd /Users/richardhightower/clients/spillwave/src/agent-memory && cargo build -p memory-bench 2>&1 | tail -5
+
+
+ - grep -q "memory-bench" Cargo.toml (workspace members)
+ - grep -q "locomo-data" .gitignore
+ - test -f crates/memory-bench/Cargo.toml
+ - test -f crates/memory-bench/src/main.rs
+ - test -f benchmarks/baselines.toml
+ - test -x benchmarks/scripts/download-locomo.sh
+ - test -f benchmarks/fixtures/sessions/auth-decision.jsonl
+ - cargo build -p memory-bench succeeds
+
+ memory-bench crate compiles, all data files exist, locomo-data is gitignored
+
+
+
+ Task 2: Implement fixture format and TOML loader with tests
+
+ crates/memory-bench/src/fixture.rs,
+ crates/memory-bench/src/lib.rs,
+ benchmarks/fixtures/temporal-001.toml,
+ benchmarks/fixtures/multisession-001.toml,
+ benchmarks/fixtures/compression-001.toml
+
+
+ docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md (Task 2 section — fixture.rs code, fixture TOML examples),
+ crates/memory-bench/src/lib.rs
+
+
+1. Create `crates/memory-bench/src/fixture.rs` with types and loader:
+```rust
+use anyhow::{bail, Result};
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+
+#[derive(Debug, Deserialize, Serialize)]
+pub struct Fixture {
+ #[serde(rename = "test")]
+ pub tests: Vec<TestCase>,
+}
+
+#[derive(Debug, Deserialize, Serialize, Clone)]
+pub struct TestCase {
+ pub id: String,
+ pub description: String,
+ pub setup: Vec<String>, // Paths to JSONL session files
+ pub query: String,
+ pub expected_contains: Vec<String>, // Case-insensitive substring match
+ pub max_tokens: usize,
+}
+
+impl Fixture {
+ pub fn load(path: &Path) -> Result<Self> { /* validate id non-empty, query non-empty */ }
+ pub fn load_dir(dir: &Path) -> Result<Vec<TestCase>> { /* read all .toml files in dir */ }
+}
+```
+
+2. Add tests in fixture.rs:
+- `test_fixture_parses_valid_toml` — parse a valid fixture, assert tests.len() and id
+- `test_fixture_validates_required_fields` — empty id or missing query returns Err
+- `test_load_dir_collects_all_fixtures` — create temp dir with 2 TOML files, verify combined count
+
+3. Create `benchmarks/fixtures/temporal-001.toml`:
+```toml
+[[test]]
+id = "temporal-001"
+description = "Recall an architectural decision made in a prior session"
+setup = ["sessions/auth-decision.jsonl"]
+query = "what authentication approach did we decide on?"
+expected_contains = ["JWT", "token"]
+max_tokens = 500
+
+[[test]]
+id = "temporal-002"
+description = "Recall a specific bug fix from two sessions ago"
+setup = ["sessions/bug-fix.jsonl", "sessions/follow-up.jsonl"]
+query = "how did we fix the null pointer exception?"
+expected_contains = ["null check", "Option"]
+max_tokens = 400
+```
+
+4. Create `benchmarks/fixtures/multisession-001.toml`:
+```toml
+[[test]]
+id = "multi-001"
+description = "Connect a decision from session A with an outcome from session B"
+setup = ["sessions/session-a.jsonl", "sessions/session-b.jsonl", "sessions/session-c.jsonl"]
+query = "what was the outcome of the approach we chose last week?"
+expected_contains = ["performance", "latency"]
+max_tokens = 600
+```
+
+5. Create `benchmarks/fixtures/compression-001.toml`:
+```toml
+[[test]]
+id = "compress-001"
+description = "Verify context is compressed vs raw session dump"
+setup = ["sessions/long-session.jsonl"]
+query = "summarize the key decisions from this project"
+expected_contains = ["decision", "architecture"]
+max_tokens = 800
+```
+
+6. Ensure `pub mod fixture;` is in lib.rs.
+
+7. Run: `cargo test -p memory-bench fixture` — all tests pass.
+
+
+ - Test: valid TOML fixture parses into Fixture with correct test count and ids
+ - Test: fixture with empty id returns error
+ - Test: fixture with missing query returns error
+ - Test: load_dir reads multiple .toml files and collects all TestCase entries
+
+
+ cd /Users/richardhightower/clients/spillwave/src/agent-memory && cargo test -p memory-bench fixture 2>&1 | tail -10
+
+
+ - grep -q "pub struct Fixture" crates/memory-bench/src/fixture.rs
+ - grep -q "pub struct TestCase" crates/memory-bench/src/fixture.rs
+ - grep -q "pub fn load" crates/memory-bench/src/fixture.rs
+ - grep -q "pub fn load_dir" crates/memory-bench/src/fixture.rs
+ - test -f benchmarks/fixtures/temporal-001.toml
+ - test -f benchmarks/fixtures/multisession-001.toml
+ - test -f benchmarks/fixtures/compression-001.toml
+ - grep -q "temporal-001" benchmarks/fixtures/temporal-001.toml
+ - cargo test -p memory-bench fixture passes (3+ tests)
+
+ Fixture loader parses TOML, validates required fields, load_dir collects all fixtures from a directory, 3 fixture TOML files exist with realistic test cases
+
+
+
+
+
+- `cargo build -p memory-bench` compiles without errors
+- `cargo test -p memory-bench` passes all fixture tests
+- `cargo clippy -p memory-bench -- -D warnings` produces no warnings
+- All benchmark data files exist in `benchmarks/` directory
+- `locomo-data/` appears in `.gitignore`
+
+
+
+- memory-bench crate is a workspace member and compiles
+- Fixture type system (Fixture, TestCase) is defined and exported
+- 3 TOML fixture files with realistic test cases exist
+- 7 stub JSONL session files exist for ingestion tests
+- baselines.toml has MemMachine and Mem0 scores
+- download-locomo.sh is executable
+- locomo-data/ is gitignored
+- All fixture loader tests pass
+
+
+
diff --git a/.planning/phases/53-benchmark-suite/53-01-SUMMARY.md b/.planning/phases/53-benchmark-suite/53-01-SUMMARY.md
new file mode 100644
index 0000000..b34c455
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-01-SUMMARY.md
@@ -0,0 +1,123 @@
+---
+phase: 53-benchmark-suite
+plan: 01
+subsystem: testing
+tags: [benchmark, toml, fixtures, serde, memory-bench]
+
+requires:
+ - phase: 52-simple-cli-api
+ provides: "memory CLI binary for benchmark queries"
+provides:
+ - "memory-bench crate with fixture loader"
+ - "TOML fixture format (Fixture, TestCase types)"
+ - "3 benchmark fixture files (temporal, multisession, compression)"
+ - "7 stub JSONL session files"
+ - "Competitor baselines in benchmarks/baselines.toml"
+ - "LOCOMO download script"
+affects: [53-02, 53-03]
+
+tech-stack:
+ added: [memory-bench crate]
+ patterns: [TOML fixture loading with validation, JSONL session stubs]
+
+key-files:
+ created:
+ - crates/memory-bench/Cargo.toml
+ - crates/memory-bench/src/lib.rs
+ - crates/memory-bench/src/main.rs
+ - crates/memory-bench/src/fixture.rs
+ - benchmarks/baselines.toml
+ - benchmarks/scripts/download-locomo.sh
+ - benchmarks/fixtures/temporal-001.toml
+ - benchmarks/fixtures/multisession-001.toml
+ - benchmarks/fixtures/compression-001.toml
+ - benchmarks/fixtures/sessions/auth-decision.jsonl
+ - benchmarks/fixtures/sessions/bug-fix.jsonl
+ - benchmarks/fixtures/sessions/follow-up.jsonl
+ - benchmarks/fixtures/sessions/session-a.jsonl
+ - benchmarks/fixtures/sessions/session-b.jsonl
+ - benchmarks/fixtures/sessions/session-c.jsonl
+ - benchmarks/fixtures/sessions/long-session.jsonl
+ modified:
+ - Cargo.toml
+ - .gitignore
+
+key-decisions:
+ - "TOML fixture format with [[test]] arrays for multi-case files"
+ - "Fixture::load validates id and query non-empty at parse time"
+ - "Fixture::load_dir sorts entries for deterministic ordering"
+
+patterns-established:
+ - "Fixture TOML: [[test]] with id, description, setup, query, expected_contains, max_tokens"
+ - "Session stubs: JSONL with {role, content} objects per line"
+
+requirements-completed: [BENCH-01, BENCH-06]
+
+duration: 4min
+completed: 2026-03-23
+---
+
+# Phase 53 Plan 01: Benchmark Suite Foundation Summary
+
+**memory-bench crate with TOML fixture loader, 3 benchmark fixtures, 7 session stubs, and competitor baselines**
+
+## Performance
+
+- **Duration:** 4 min
+- **Started:** 2026-03-23T02:13:14Z
+- **Completed:** 2026-03-23T02:17:00Z
+- **Tasks:** 2
+- **Files modified:** 18
+
+## Accomplishments
+- Scaffolded memory-bench crate as workspace member with all required dependencies
+- Implemented Fixture and TestCase types with TOML deserialization and validation
+- Created 3 fixture files covering temporal recall, multisession reasoning, and compression
+- Created 7 realistic JSONL session stubs for benchmark ingestion
+- Added competitor baselines (MemMachine, Mem0) and LOCOMO download script
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Scaffold memory-bench crate and benchmark data files** - `5d7cd54` (feat)
+2. **Task 2: Implement fixture format and TOML loader with tests** - `4355227` (feat)
+
+## Files Created/Modified
+- `crates/memory-bench/Cargo.toml` - Crate manifest with workspace deps
+- `crates/memory-bench/src/main.rs` - Minimal binary entrypoint
+- `crates/memory-bench/src/lib.rs` - Module declarations
+- `crates/memory-bench/src/fixture.rs` - TOML fixture loader with Fixture/TestCase types and 4 tests
+- `benchmarks/baselines.toml` - MemMachine and Mem0 competitor scores
+- `benchmarks/scripts/download-locomo.sh` - LOCOMO dataset download script
+- `benchmarks/fixtures/temporal-001.toml` - Temporal recall test cases
+- `benchmarks/fixtures/multisession-001.toml` - Multi-session reasoning test case
+- `benchmarks/fixtures/compression-001.toml` - Compression efficiency test case
+- `benchmarks/fixtures/sessions/*.jsonl` - 7 session stub files
+- `Cargo.toml` - Added memory-bench to workspace members
+- `.gitignore` - Added locomo-data/ exclusion
+
+## Decisions Made
+- TOML fixture format uses `[[test]]` arrays allowing multiple test cases per file
+- Fixture::load validates id and query non-empty at parse time (fail-fast)
+- Fixture::load_dir sorts directory entries for deterministic test ordering
+- Session stubs use realistic multi-turn conversations about auth, bug fixes, and caching
+
+## Deviations from Plan
+
+None - plan executed exactly as written.
+
+## Issues Encountered
+None
+
+## User Setup Required
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Fixture loader types (Fixture, TestCase) ready for runner and scorer in Plan 02
+- Session JSONL stubs ready for ingestion testing
+- Baselines ready for comparison report generation in Plan 02
+
+---
+*Phase: 53-benchmark-suite*
+*Completed: 2026-03-23*
diff --git a/.planning/phases/53-benchmark-suite/53-02-PLAN.md b/.planning/phases/53-benchmark-suite/53-02-PLAN.md
new file mode 100644
index 0000000..9206b7f
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-02-PLAN.md
@@ -0,0 +1,529 @@
+---
+phase: 53-benchmark-suite
+plan: 02
+type: execute
+wave: 2
+depends_on: ["53-01"]
+files_modified:
+ - crates/memory-bench/src/runner.rs
+ - crates/memory-bench/src/scorer.rs
+ - crates/memory-bench/src/report.rs
+ - crates/memory-bench/src/baseline.rs
+ - crates/memory-bench/src/cli.rs
+ - crates/memory-bench/src/main.rs
+ - crates/memory-bench/src/lib.rs
+autonomous: true
+requirements:
+ - BENCH-02
+ - BENCH-03
+ - BENCH-05
+ - BENCH-08
+
+must_haves:
+ truths:
+ - "Runner shells out to memory binary and captures JSON output + latency"
+ - "Scorer computes accuracy and recall@5 from expected_contains matching"
+ - "Scorer computes compression_ratio from context_tokens vs raw_tokens"
+ - "Report generates both JSON and markdown table formats"
+ - "Baseline loader reads benchmarks/baselines.toml into typed structs"
+ - "CLI exposes temporal, multisession, compression, all, and locomo subcommands"
+ - "--compare flag reads baselines and prints side-by-side table"
+ artifacts:
+ - path: "crates/memory-bench/src/runner.rs"
+ provides: "Benchmark runner that shells out to memory binary"
+ exports: ["QueryResult", "run_query"]
+ - path: "crates/memory-bench/src/scorer.rs"
+ provides: "Result scoring: accuracy, recall@k, latency percentiles, compression ratio"
+ exports: ["BenchmarkReport", "score_result", "compute_accuracy", "compute_compression_ratio"]
+ - path: "crates/memory-bench/src/report.rs"
+ provides: "JSON + markdown report generation"
+ exports: ["to_markdown", "to_json"]
+ - path: "crates/memory-bench/src/baseline.rs"
+ provides: "Competitor baseline TOML loader"
+ exports: ["Baselines", "CompetitorScore"]
+ - path: "crates/memory-bench/src/cli.rs"
+ provides: "Clap CLI definition with all subcommands"
+ exports: ["Cli", "Commands"]
+ key_links:
+ - from: "crates/memory-bench/src/runner.rs"
+ to: "memory binary"
+ via: "std::process::Command"
+ pattern: "Command::new.*memory.*search"
+ - from: "crates/memory-bench/src/scorer.rs"
+ to: "crates/memory-bench/src/runner.rs"
+ via: "scores QueryResult against TestCase expected_contains"
+ pattern: "score_result"
+ - from: "crates/memory-bench/src/scorer.rs"
+ to: "benchmarks/fixtures/*.jsonl"
+ via: "compute_compression_ratio counts raw_tokens from JSONL setup file chars"
+ pattern: "compute_compression_ratio"
+ - from: "crates/memory-bench/src/report.rs"
+ to: "crates/memory-bench/src/scorer.rs"
+ via: "formats BenchmarkReport into JSON/markdown"
+ pattern: "BenchmarkReport"
+ - from: "crates/memory-bench/src/main.rs"
+ to: "crates/memory-bench/src/cli.rs"
+ via: "Clap Parser dispatch"
+ pattern: "Commands::"
+---
+
+
+Implement the benchmark runner, scorer, report generator, baseline comparison, and CLI wiring for the custom harness.
+
+Purpose: This is the core benchmark engine. The runner executes queries against the memory binary, the scorer computes metrics, and the report generator produces publishable JSON/markdown output. The CLI wires everything into subcommands.
+Output: Working `memory-bench` binary with temporal/multisession/compression/all subcommands, --compare flag, and JSON+markdown output.
+
+
+
+@/Users/richardhightower/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/richardhightower/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/53-benchmark-suite/53-01-SUMMARY.md
+@docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md
+
+
+
+From crates/memory-bench/src/fixture.rs:
+```rust
+pub struct Fixture {
+ pub tests: Vec<TestCase>,
+}
+pub struct TestCase {
+ pub id: String,
+ pub description: String,
+ pub setup: Vec<String>,
+ pub query: String,
+ pub expected_contains: Vec<String>,
+ pub max_tokens: usize,
+}
+impl Fixture {
+ pub fn load(path: &Path) -> Result<Fixture>;
+ pub fn load_dir(dir: &Path) -> Result<Vec<Fixture>>;
+}
+```
+
+
+From crates/memory-cli/src/output.rs:
+```rust
+pub struct JsonEnvelope {
+ pub status: String, // "ok" or "error"
+ pub query: Option<String>,
+ pub results: serde_json::Value,
+ pub meta: serde_json::Value, // includes tokens_estimated
+}
+```
+The runner calls `memory search "query" --format=json` and parses the JSON envelope to extract results and meta.tokens_estimated.
+
+
+
+
+
+
+ Task 1: Implement runner, scorer, and report modules
+
+ crates/memory-bench/src/runner.rs,
+ crates/memory-bench/src/scorer.rs,
+ crates/memory-bench/src/report.rs,
+ crates/memory-bench/src/baseline.rs,
+ crates/memory-bench/src/lib.rs
+
+
+ docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md (Tasks 3 and 4 — runner.rs, scorer.rs, report.rs, baseline.rs code),
+ crates/memory-bench/src/lib.rs,
+ crates/memory-bench/src/fixture.rs (to understand TestCase struct),
+ benchmarks/baselines.toml (to understand the format baseline.rs loads)
+
+
+1. Create `crates/memory-bench/src/scorer.rs`:
+```rust
+/// Returns true if result text contains at least one expected string (case-insensitive).
+pub fn score_result(result: &str, expected_contains: &[String]) -> bool {
+ let lower = result.to_lowercase();
+ expected_contains.iter().any(|e| lower.contains(&e.to_lowercase()))
+}
+
+pub fn compute_accuracy(hits: &[bool]) -> f64 {
+ if hits.is_empty() { return 0.0; }
+ hits.iter().filter(|&&h| h).count() as f64 / hits.len() as f64
+}
+
+pub fn compute_recall_at_k(hits_in_top_k: &[bool], total_relevant: usize) -> f64 {
+ if total_relevant == 0 { return 0.0; }
+ hits_in_top_k.iter().filter(|&&h| h).count() as f64 / total_relevant as f64
+}
+
+pub fn percentile(sorted_values: &[u64], p: f64) -> u64 {
+ if sorted_values.is_empty() { return 0; }
+ let idx = ((p / 100.0) * (sorted_values.len() as f64 - 1.0)).round() as usize;
+ sorted_values[idx.min(sorted_values.len() - 1)]
+}
+
+/// Compute compression ratio: how much smaller the context_tokens are compared to raw input.
+///
+/// Formula: `1.0 - (context_tokens as f64 / raw_tokens as f64)`
+///
+/// - `context_tokens`: tokens_estimated returned by the memory search JSON envelope
+/// (meta.tokens_estimated from the runner's QueryResult).
+/// - `raw_tokens`: derived by counting total characters across all JSONL setup lines for the
+/// test case (each line in TestCase.setup is a raw conversation turn). Divide char count by
+/// 4.0 as a standard chars-per-token approximation.
+///
+/// Returns 0.0 if raw_tokens is 0 (prevents divide-by-zero).
+pub fn compute_compression_ratio(context_tokens: usize, raw_tokens: usize) -> f64 {
+ if raw_tokens == 0 { return 0.0; }
+ 1.0 - (context_tokens as f64 / raw_tokens as f64)
+}
+
+/// Estimate raw token count from JSONL setup strings (TestCase.setup lines).
+/// Sums character lengths of all setup strings and divides by 4 (chars-per-token approximation).
+pub fn estimate_raw_tokens(setup_lines: &[String]) -> usize {
+ let total_chars: usize = setup_lines.iter().map(|s| s.len()).sum();
+ (total_chars as f64 / 4.0).ceil() as usize
+}
+
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct BenchmarkReport {
+ pub accuracy: f64,
+ pub recall_at_5: f64,
+ pub token_usage_avg: usize,
+ pub latency_p50_ms: u64,
+ pub latency_p95_ms: u64,
+ pub compression_ratio: f64,
+ pub test_count: usize,
+ pub pass_count: usize,
+}
+```
+
+Tests for scorer:
+- `test_score_hit_when_expected_present` — "JWT" in "We chose JWT for stateless auth" = true
+- `test_score_miss_when_none_present` — "JWT" NOT in "We chose sessions with cookies" = false
+- `test_score_case_insensitive` — "jwt" matches "JWT" = true
+- `test_accuracy_all_hits` — [true, true, true] = 1.0
+- `test_accuracy_partial` — [true, false, true] ≈ 0.667 (2/3; compare with a tolerance, not exact equality)
+- `test_accuracy_empty` — [] = 0.0
+- `test_percentile_p50` — [10, 20, 30, 40, 50] at p50 = 30
+- `test_percentile_p95` — [10, 20, 30, 40, 50] at p95 = 50
+- `test_compression_ratio_typical` — compute_compression_ratio(250, 1000) = 0.75
+- `test_compression_ratio_zero_raw` — compute_compression_ratio(100, 0) = 0.0
+- `test_estimate_raw_tokens` — estimate_raw_tokens(&["hello world".to_string()]) = ceil(11/4) = 3
+
+2. Create `crates/memory-bench/src/runner.rs`:
+```rust
+use std::process::Command;
+use std::time::Instant;
+
+pub struct RunConfig {
+ pub memory_bin: String, // path to memory binary, default "memory"
+}
+
+impl Default for RunConfig {
+ fn default() -> Self { Self { memory_bin: "memory".to_string() } }
+}
+
+pub struct QueryResult {
+ pub raw_output: String,
+ pub latency_ms: u64,
+ pub tokens_estimated: usize,
+ pub success: bool,
+}
+
+pub fn run_query(query: &str, config: &RunConfig) -> QueryResult {
+ let start = Instant::now();
+ let output = Command::new(&config.memory_bin)
+ .args(["search", query, "--format=json"])
+ .output();
+ // Handle Command failure gracefully (binary not found = success: false)
+ // Parse JSON envelope for meta.tokens_estimated
+ // Return QueryResult with latency_ms from elapsed time
+}
+
+/// Ingest a JSONL session file by calling `memory add` for each line.
+pub fn ingest_session(session_path: &str, config: &RunConfig) -> anyhow::Result<()> {
+ // Read JSONL, for each line call: memory add --content "..." --kind episodic
+ // Skip lines that fail to parse
+}
+```
+
+Note: runner.rs does NOT need unit tests (it shells out to external binary). Integration tested at CLI level.
+
+3. Create `crates/memory-bench/src/baseline.rs`:
+```rust
+use anyhow::Result;
+use serde::Deserialize;
+use std::path::Path;
+
+#[derive(Debug, Deserialize)]
+pub struct Baselines {
+ pub memmachine: Option<CompetitorScore>,
+ pub mem0: Option<CompetitorScore>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct CompetitorScore {
+ pub locomo_score: Option<f64>,
+ pub token_reduction: Option<f64>,
+ pub latency_improvement: Option<f64>,
+ pub accuracy_vs_openai_memory: Option<f64>,
+ pub latency_reduction: Option<f64>,
+}
+
+impl Baselines {
+ pub fn load(path: &Path) -> Result<Self> {
+ let content = std::fs::read_to_string(path)?;
+ Ok(toml::from_str(&content)?)
+ }
+}
+```
+
+Test for baseline:
+- `test_baselines_load` — parse the actual benchmarks/baselines.toml content, assert memmachine.locomo_score = Some(0.91)
+
+4. Create `crates/memory-bench/src/report.rs`:
+```rust
+use crate::scorer::BenchmarkReport;
+use crate::baseline::Baselines;
+
+pub fn to_json(report: &BenchmarkReport) -> String {
+ serde_json::to_string_pretty(report).unwrap_or_default()
+}
+
+pub fn to_markdown(report: &BenchmarkReport, baselines: Option<&Baselines>) -> String {
+ // Generate markdown table with columns: Metric | Agent-Memory | MemMachine | Mem0
+ // Include: accuracy, recall@5, avg tokens, latency p50, latency p95, compression ratio
+ // If baselines is None, single-column table
+}
+```
+
+Test for report:
+- `test_to_json_roundtrips` — serialize + deserialize BenchmarkReport matches original
+- `test_to_markdown_contains_headers` — output contains "Accuracy" and "Recall@5"
+
+5. Update `crates/memory-bench/src/lib.rs` to declare all modules:
+```rust
+pub mod fixture;
+pub mod runner;
+pub mod scorer;
+pub mod report;
+pub mod baseline;
+```
+
+6. Run: `cargo test -p memory-bench` — all scorer, baseline, and report tests pass.
+7. Run: `cargo clippy -p memory-bench -- -D warnings` — no warnings.
+
+
+ - Test: score_result returns true when expected substring present (case-insensitive)
+ - Test: score_result returns false when no expected substring matches
+ - Test: compute_accuracy returns correct ratio for all-hit, partial-hit, empty cases
+ - Test: percentile returns correct p50 and p95 values
+ - Test: compute_compression_ratio returns 1.0 - (context/raw), 0.0 when raw_tokens is 0
+ - Test: estimate_raw_tokens returns ceil(total_chars / 4)
+ - Test: baselines TOML parses into typed struct
+ - Test: to_json round-trips through serde
+ - Test: to_markdown contains metric headers
+
+
+ cd /Users/richardhightower/clients/spillwave/src/agent-memory && cargo test -p memory-bench 2>&1 | tail -15
+
+
+ - grep -q "pub fn score_result" crates/memory-bench/src/scorer.rs
+ - grep -q "pub fn compute_accuracy" crates/memory-bench/src/scorer.rs
+ - grep -q "pub fn compute_compression_ratio" crates/memory-bench/src/scorer.rs
+ - grep -q "pub fn estimate_raw_tokens" crates/memory-bench/src/scorer.rs
+ - grep -q "pub struct BenchmarkReport" crates/memory-bench/src/scorer.rs
+ - grep -q "pub fn run_query" crates/memory-bench/src/runner.rs
+ - grep -q "pub struct Baselines" crates/memory-bench/src/baseline.rs
+ - grep -q "pub fn to_json" crates/memory-bench/src/report.rs
+ - grep -q "pub fn to_markdown" crates/memory-bench/src/report.rs
+ - cargo test -p memory-bench passes (13+ tests including scorer, baseline, report)
+ - cargo clippy -p memory-bench -- -D warnings passes
+
+ Runner, scorer, report, and baseline modules exist with passing tests. Scorer computes accuracy/recall/percentiles/compression_ratio. Report generates JSON and markdown. Baseline loads competitor TOML.
+
+
+
+ Task 2: Wire CLI subcommands and run pipeline end-to-end
+
+ crates/memory-bench/src/cli.rs,
+ crates/memory-bench/src/main.rs
+
+
+ docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md (Task 5 — CLI wiring code),
+ crates/memory-bench/src/lib.rs,
+ crates/memory-bench/src/runner.rs,
+ crates/memory-bench/src/scorer.rs,
+ crates/memory-bench/src/report.rs,
+ crates/memory-bench/src/fixture.rs
+
+
+1. Create `crates/memory-bench/src/cli.rs`:
+```rust
+use clap::{Parser, Subcommand};
+
+#[derive(Parser)]
+#[command(name = "memory-bench", about = "Benchmark suite for Agent Memory")]
+pub struct Cli {
+ #[command(subcommand)]
+ pub command: Commands,
+
+ /// Path to memory binary (default: searches PATH)
+ #[arg(long, global = true, default_value = "memory")]
+ pub memory_bin: String,
+}
+
+#[derive(Subcommand)]
+pub enum Commands {
+ /// Run temporal recall benchmarks
+ Temporal {
+ #[arg(long, default_value = "benchmarks/fixtures")]
+ fixtures: String,
+ #[arg(long)]
+ output: Option<String>,
+ },
+ /// Run multi-session reasoning benchmarks
+ Multisession {
+ #[arg(long, default_value = "benchmarks/fixtures")]
+ fixtures: String,
+ #[arg(long)]
+ output: Option<String>,
+ },
+ /// Run compression efficiency benchmarks
+ Compression {
+ #[arg(long, default_value = "benchmarks/fixtures")]
+ fixtures: String,
+ #[arg(long)]
+ output: Option<String>,
+ },
+ /// Run full custom benchmark suite
+ All {
+ #[arg(long, default_value = "benchmarks/fixtures")]
+ fixtures: String,
+ #[arg(long)]
+ output: Option<String>,
+ /// Compare against competitor baselines
+ #[arg(long)]
+ compare: bool,
+ /// Path to baselines TOML
+ #[arg(long, default_value = "benchmarks/baselines.toml")]
+ baselines: String,
+ },
+ /// Run LOCOMO adapter benchmark
+ Locomo {
+ /// Path to LOCOMO dataset directory
+ #[arg(long)]
+ dataset: String,
+ #[arg(long)]
+ output: Option<String>,
+ /// Compare against competitor baselines
+ #[arg(long)]
+ compare: bool,
+ #[arg(long, default_value = "benchmarks/baselines.toml")]
+ baselines: String,
+ },
+}
+```
+
+2. Update `crates/memory-bench/src/main.rs` to wire the full pipeline:
+```rust
+use clap::Parser;
+mod cli;
+use memory_bench::{fixture, runner, scorer, report, baseline};
+
+fn main() -> anyhow::Result<()> {
+ let cli = cli::Cli::parse();
+ let config = runner::RunConfig { memory_bin: cli.memory_bin.clone() };
+
+ match cli.command {
+ cli::Commands::Temporal { fixtures, output } => {
+ run_category("temporal", &fixtures, &config, output.as_deref())?;
+ }
+ cli::Commands::Multisession { fixtures, output } => {
+ run_category("multi", &fixtures, &config, output.as_deref())?;
+ }
+ cli::Commands::Compression { fixtures, output } => {
+ run_category("compress", &fixtures, &config, output.as_deref())?;
+ }
+ cli::Commands::All { fixtures, output, compare, baselines } => {
+ let report = run_all(&fixtures, &config)?;
+ let baselines_data = if compare {
+ Some(baseline::Baselines::load(std::path::Path::new(&baselines))?)
+ } else { None };
+ let json = report::to_json(&report);
+ let md = report::to_markdown(&report, baselines_data.as_ref());
+ println!("{md}");
+ if let Some(path) = output {
+ std::fs::write(&path, &json)?;
+ eprintln!("Results written to {path}");
+ }
+ }
+ cli::Commands::Locomo { .. } => {
+ eprintln!("LOCOMO adapter — use memory-bench locomo --dataset=./locomo-data/");
+ // Delegated to locomo module (Plan 03)
+ }
+ }
+ Ok(())
+}
+```
+
+The `run_category` function:
+- Loads fixtures from the directory, filters by category prefix (test.id starts with category prefix)
+- For each test case: ingest sessions via runner::ingest_session, run query via runner::run_query
+- Score results via scorer::score_result
+- Compute compression_ratio using scorer::compute_compression_ratio(result.tokens_estimated, scorer::estimate_raw_tokens(&test.setup))
+- Collect latencies, compute BenchmarkReport
+- Print markdown report
+
+The `run_all` function calls run_category for each category and aggregates into one BenchmarkReport.
+
+3. Do NOT add `pub mod cli;` to lib.rs — cli.rs is declared via `mod cli;` in main.rs (binary-only scope). Keep lib.rs as-is.
+
+4. Verify: `cargo run -p memory-bench -- --help` shows all subcommands.
+5. Verify: `cargo run -p memory-bench -- all --help` shows --fixtures, --output, --compare, --baselines flags.
+6. Run: `cargo clippy -p memory-bench -- -D warnings` — no warnings.
+
+
+ cd /Users/richardhightower/clients/spillwave/src/agent-memory && cargo run -p memory-bench -- --help 2>&1 | head -20
+
+
+ - grep -q "pub struct Cli" crates/memory-bench/src/cli.rs
+ - grep -q "Temporal" crates/memory-bench/src/cli.rs
+ - grep -q "Multisession" crates/memory-bench/src/cli.rs
+ - grep -q "Compression" crates/memory-bench/src/cli.rs
+ - grep -q "Locomo" crates/memory-bench/src/cli.rs
+ - grep -q "compare" crates/memory-bench/src/cli.rs
+ - grep -q "memory_bin" crates/memory-bench/src/cli.rs
+ - cargo run -p memory-bench -- --help shows temporal, multisession, compression, all, locomo
+ - cargo clippy -p memory-bench -- -D warnings passes
+
+ memory-bench binary shows all 5 subcommands in --help. The `all` subcommand loads fixtures, runs queries, scores results, and outputs JSON+markdown report. --compare reads baselines.toml for side-by-side table.
+
+
+
+
+
+- `cargo build -p memory-bench` compiles
+- `cargo test -p memory-bench` passes all tests (scorer, baseline, report, fixture)
+- `cargo run -p memory-bench -- --help` shows 5 subcommands
+- `cargo run -p memory-bench -- all --help` shows --compare and --output flags
+- `cargo clippy -p memory-bench -- -D warnings` passes
+
+
+
+- Runner shells out to memory binary via std::process::Command
+- Scorer computes accuracy, recall@5, latency p50/p95, compression ratio with passing unit tests
+- compression_ratio = 1.0 - (context_tokens / raw_tokens), raw_tokens estimated from JSONL setup chars / 4
+- Report module generates both JSON and markdown formats
+- Baseline module loads benchmarks/baselines.toml
+- CLI exposes temporal, multisession, compression, all, locomo subcommands
+- --compare flag triggers baseline comparison in output
+- --memory-bin global flag allows overriding binary path
+- All tests pass, clippy clean
+
+
+
diff --git a/.planning/phases/53-benchmark-suite/53-02-SUMMARY.md b/.planning/phases/53-benchmark-suite/53-02-SUMMARY.md
new file mode 100644
index 0000000..b8d3a97
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-02-SUMMARY.md
@@ -0,0 +1,111 @@
+---
+phase: 53-benchmark-suite
+plan: 02
+subsystem: testing
+tags: [benchmark, scorer, runner, clap, serde, toml, markdown]
+
+requires:
+ - phase: 53-benchmark-suite plan 01
+ provides: "Fixture loader, TOML format, test case structs"
+provides:
+ - "Benchmark runner shelling out to memory binary"
+ - "Scorer: accuracy, recall@k, percentile, compression ratio"
+ - "Report generator: JSON + markdown with optional baseline comparison"
+ - "Baseline TOML loader for competitor scores"
+ - "CLI with temporal, multisession, compression, all, locomo subcommands"
+affects: [53-benchmark-suite plan 03, benchmark-integration]
+
+tech-stack:
+ added: []
+ patterns: ["shell-out runner pattern via std::process::Command", "BenchmarkReport as shared metric struct"]
+
+key-files:
+ created:
+ - crates/memory-bench/src/scorer.rs
+ - crates/memory-bench/src/runner.rs
+ - crates/memory-bench/src/report.rs
+ - crates/memory-bench/src/baseline.rs
+ - crates/memory-bench/src/cli.rs
+ modified:
+ - crates/memory-bench/src/lib.rs
+ - crates/memory-bench/src/main.rs
+
+key-decisions:
+ - "Runner shells out via std::process::Command (no in-process coupling)"
+ - "Compression ratio: 1.0 - (context_tokens / raw_tokens), raw_tokens from chars/4"
+ - "CLI uses clap subcommands with global --memory-bin flag"
+ - "Report supports single-column and comparison table modes"
+
+patterns-established:
+ - "BenchmarkReport as shared metric struct across scorer/report modules"
+ - "Category filtering by test ID prefix (temporal, multi, compress)"
+
+requirements-completed: [BENCH-02, BENCH-03, BENCH-05, BENCH-08]
+
+duration: 3min
+completed: 2026-03-23
+---
+
+# Phase 53 Plan 02: Runner, Scorer, Report, and CLI Summary
+
+**Benchmark engine with scorer (accuracy/recall/compression), report generator (JSON+markdown), baseline comparison, and 5-subcommand CLI**
+
+## Performance
+
+- **Duration:** 3 min
+- **Started:** 2026-03-23T02:19:10Z
+- **Completed:** 2026-03-23T02:22:00Z
+- **Tasks:** 2
+- **Files modified:** 7
+
+## Accomplishments
+- Scorer module with accuracy, recall@k, percentile, compression ratio, token estimation (11 unit tests)
+- Runner module shelling out to memory binary for search and session ingestion
+- Report module generating JSON (round-trippable) and markdown tables with optional baseline columns
+- Baseline TOML loader for MemMachine and Mem0 competitor scores
+- CLI with temporal, multisession, compression, all, locomo subcommands and --compare flag
+- Full pipeline: load fixtures -> filter by category -> ingest sessions -> query -> score -> report
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Implement runner, scorer, report, and baseline modules** - `4d63f2b` (feat)
+2. **Task 2: Wire CLI subcommands and run pipeline** - `dc37de6` (feat)
+
+## Files Created/Modified
+- `crates/memory-bench/src/scorer.rs` - Accuracy, recall@k, percentile, compression ratio, BenchmarkReport struct
+- `crates/memory-bench/src/runner.rs` - Shell out to memory binary for queries and session ingestion
+- `crates/memory-bench/src/report.rs` - JSON and markdown report generation with baseline comparison
+- `crates/memory-bench/src/baseline.rs` - TOML loader for competitor benchmark scores
+- `crates/memory-bench/src/cli.rs` - Clap CLI with 5 subcommands and global --memory-bin flag
+- `crates/memory-bench/src/main.rs` - Full pipeline wiring: fixtures -> run -> score -> report
+- `crates/memory-bench/src/lib.rs` - Module declarations
+
+## Decisions Made
+- Runner shells out via std::process::Command (clean separation, no in-process coupling)
+- Compression ratio formula: 1.0 - (context_tokens / raw_tokens), with raw_tokens estimated as total_chars / 4
+- CLI uses clap subcommands with global --memory-bin flag for binary path override
+- Report supports both single-column (no baselines) and multi-column (with baselines) table modes
+- Category filtering done by test ID prefix matching
+
+## Deviations from Plan
+
+None - plan executed exactly as written.
+
+## Issues Encountered
+
+None
+
+## User Setup Required
+
+None - no external service configuration required.
+
+## Next Phase Readiness
+- All core modules ready for Plan 03 (LOCOMO adapter)
+- LOCOMO subcommand placeholder wired in CLI, ready for implementation
+- BenchmarkReport struct shared across scorer and report modules
+
+---
+*Phase: 53-benchmark-suite*
+*Completed: 2026-03-23*
diff --git a/.planning/phases/53-benchmark-suite/53-03-PLAN.md b/.planning/phases/53-benchmark-suite/53-03-PLAN.md
new file mode 100644
index 0000000..4da827f
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-03-PLAN.md
@@ -0,0 +1,367 @@
+---
+phase: 53-benchmark-suite
+plan: 03
+type: execute
+wave: 3
+depends_on: ["53-01", "53-02"]
+files_modified:
+ - crates/memory-bench/src/locomo.rs
+ - crates/memory-bench/src/lib.rs
+ - crates/memory-bench/src/main.rs
+ - crates/memory-bench/src/cli.rs
+ - .github/workflows/ci.yml
+autonomous: true
+requirements:
+ - BENCH-04
+ - BENCH-07
+
+must_haves:
+ truths:
+ - "LOCOMO JSON dataset files parse into typed Rust structs"
+ - "LOCOMO adapter loads conversations with 4 question types (single_hop, multi_hop, temporal, open_domain)"
+ - "LOCOMO results include per-type scores and aggregate score"
+ - "CI can run custom harness without LOCOMO (locomo subcommand requires --dataset flag)"
+ - "CI runs benchmark smoke test as non-blocking step (continue-on-error: true)"
+ artifacts:
+ - path: "crates/memory-bench/src/locomo.rs"
+ provides: "LOCOMO dataset loader, typed structs, and scoring adapter"
+ exports: ["LocomoConversation", "Turn", "Question", "LocomoResult", "load_dataset"]
+ - path: ".github/workflows/ci.yml"
+ provides: "Non-blocking benchmark CI step"
+ contains: "continue-on-error: true"
+ key_links:
+ - from: "crates/memory-bench/src/locomo.rs"
+ to: "locomo-data/*.json"
+ via: "load_dataset reads JSON files from directory"
+ pattern: "serde_json::from_str"
+ - from: "crates/memory-bench/src/main.rs"
+ to: "crates/memory-bench/src/locomo.rs"
+ via: "Commands::Locomo dispatch"
+ pattern: "locomo::load_dataset"
+ - from: ".github/workflows/ci.yml"
+ to: "cargo run -p memory-bench"
+ via: "benchmark job with continue-on-error"
+ pattern: "memory-bench.*--help"
+---
+
+
+Implement the LOCOMO adapter module and complete full QA verification for the benchmark suite.
+
+Purpose: The LOCOMO adapter enables publishable benchmark scores against the Snap Research dataset, making Agent-Memory comparable to MemMachine and Mem0. QA ensures the entire crate passes pr-precheck.
+Output: locomo.rs module with dataset loader and scorer, full QA verification passing, CI benchmark step added.
+
+
+
+@/Users/richardhightower/.claude/get-shit-done/workflows/execute-plan.md
+@/Users/richardhightower/.claude/get-shit-done/templates/summary.md
+
+
+
+@.planning/PROJECT.md
+@.planning/ROADMAP.md
+@.planning/STATE.md
+@.planning/phases/53-benchmark-suite/53-01-SUMMARY.md
+@docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md
+
+
+
+From crates/memory-bench/src/fixture.rs:
+```rust
+pub struct TestCase {
+ pub id: String,
+ pub description: String,
+ pub setup: Vec<String>,
+ pub query: String,
+ pub expected_contains: Vec<String>,
+ pub max_tokens: usize,
+}
+```
+
+
+From crates/memory-bench/src/scorer.rs:
+```rust
+pub fn score_result(result: &str, expected_contains: &[String]) -> bool;
+pub fn compute_accuracy(hits: &[bool]) -> f64;
+pub struct BenchmarkReport { ... }
+```
+
+From crates/memory-bench/src/runner.rs:
+```rust
+pub struct RunConfig { pub memory_bin: String }
+pub fn run_query(query: &str, config: &RunConfig) -> QueryResult;
+pub fn ingest_session(session_path: &str, config: &RunConfig) -> anyhow::Result<()>;
+```
+
+From crates/memory-bench/src/cli.rs:
+```rust
+pub struct Cli { pub command: Commands, pub memory_bin: String }
+pub enum Commands { Temporal { .. }, Multisession { .. }, Compression { .. }, All { .. }, Locomo { dataset: String, output: Option<String>, compare: bool, baselines: String } }
+```
+
+
+
+
+
+
+ Task 1: Implement LOCOMO adapter with dataset loader and scorer
+
+ crates/memory-bench/src/locomo.rs,
+ crates/memory-bench/src/lib.rs
+
+
+ docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md (Task 6 — locomo.rs code),
+ crates/memory-bench/src/lib.rs,
+ crates/memory-bench/src/scorer.rs (for score_result function signature)
+
+
+1. Create `crates/memory-bench/src/locomo.rs`:
+
+```rust
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+use anyhow::Result;
+
+#[derive(Debug, Deserialize, Clone)]
+pub struct LocomoConversation {
+ pub conversation_id: String,
+ pub turns: Vec<Turn>,
+ pub questions: Vec<Question>,
+}
+
+#[derive(Debug, Deserialize, Clone)]
+pub struct Turn {
+ pub role: String,
+ pub content: String,
+}
+
+#[derive(Debug, Deserialize, Clone)]
+pub struct Question {
+ pub question: String,
+ pub answer: String,
+ #[serde(rename = "type")]
+ pub question_type: String, // single_hop, multi_hop, temporal, open_domain
+}
+
+#[derive(Debug, Serialize)]
+pub struct LocomoResult {
+ pub conversation_id: String,
+ pub total_questions: usize,
+ pub correct: usize,
+ pub score: f64,
+ pub by_type: std::collections::HashMap<String, TypeScore>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct TypeScore {
+ pub total: usize,
+ pub correct: usize,
+ pub score: f64,
+}
+
+#[derive(Debug, Serialize)]
+pub struct LocomoAggregateResult {
+ pub conversations: usize,
+ pub total_questions: usize,
+ pub overall_score: f64,
+ pub by_type: std::collections::HashMap<String, TypeScore>,
+ pub per_conversation: Vec<LocomoResult>,
+}
+
+/// Load all LOCOMO conversations from a dataset directory.
+/// Reads all .json files in the directory.
+pub fn load_dataset(dir: &Path) -> Result<Vec<LocomoConversation>> {
+ let mut conversations = Vec::new();
+ for entry in std::fs::read_dir(dir)? {
+ let path = entry?.path();
+ if path.extension().map(|e| e == "json").unwrap_or(false) {
+ let content = std::fs::read_to_string(&path)?;
+ let conv: LocomoConversation = serde_json::from_str(&content)?;
+ conversations.push(conv);
+ }
+ }
+ Ok(conversations)
+}
+
+/// Score a single conversation's questions against retrieved results.
+/// Uses case-insensitive substring matching (same as custom harness scorer).
+pub fn score_conversation(conv: &LocomoConversation, answers: &[String]) -> LocomoResult {
+ // For each question, check if the corresponding answer contains the gold answer
+ // Group scores by question_type (single_hop, multi_hop, temporal, open_domain)
+ // Return LocomoResult with per-type breakdown
+}
+
+/// Aggregate results across all conversations.
+pub fn aggregate_results(results: &[LocomoResult]) -> LocomoAggregateResult {
+ // Sum total_questions, correct across all conversations
+ // Aggregate by_type scores
+ // Compute overall_score as correct / total_questions
+}
+```
+
+2. Add tests in locomo.rs:
+- `test_locomo_conversation_parses` — parse inline JSON with conversation_id, turns, questions; assert questions.len() == 1, question_type == "single_hop"
+- `test_locomo_conversation_multiple_types` — parse JSON with 4 questions of different types; assert all 4 question_types present
+- `test_score_conversation_all_correct` — create conv + answers where all match; assert score == 1.0
+- `test_score_conversation_partial` — create conv + answers where 2/4 match; assert score == 0.5
+- `test_aggregate_results` — aggregate 2 LocomoResults; verify total and by_type scores
+- `test_load_dataset_from_dir` — create temp dir with 2 JSON files, load_dataset returns 2 conversations
+
+3. Add `pub mod locomo;` to `crates/memory-bench/src/lib.rs`.
+
+4. Run: `cargo test -p memory-bench locomo` — all tests pass.
+5. Run: `cargo clippy -p memory-bench -- -D warnings` — no warnings.
+
+
+ - Test: LOCOMO JSON parses into LocomoConversation with correct fields
+ - Test: Multiple question types (single_hop, multi_hop, temporal, open_domain) parse correctly
+ - Test: score_conversation returns 1.0 when all answers match
+ - Test: score_conversation returns correct partial score
+ - Test: aggregate_results computes correct totals across multiple conversations
+ - Test: load_dataset reads .json files from a temp directory
+
+
+ cd /Users/richardhightower/clients/spillwave/src/agent-memory && cargo test -p memory-bench locomo 2>&1 | tail -15
+
+
+ - grep -q "pub struct LocomoConversation" crates/memory-bench/src/locomo.rs
+ - grep -q "pub struct LocomoResult" crates/memory-bench/src/locomo.rs
+ - grep -q "pub struct LocomoAggregateResult" crates/memory-bench/src/locomo.rs
+ - grep -q "pub fn load_dataset" crates/memory-bench/src/locomo.rs
+ - grep -q "pub fn score_conversation" crates/memory-bench/src/locomo.rs
+ - grep -q "pub fn aggregate_results" crates/memory-bench/src/locomo.rs
+ - grep -q "question_type" crates/memory-bench/src/locomo.rs
+ - grep -q "pub mod locomo" crates/memory-bench/src/lib.rs
+ - cargo test -p memory-bench locomo passes (6+ tests)
+
+ LOCOMO adapter parses dataset JSON, scores conversations by question type, aggregates results with per-type breakdown. All tests pass, clippy clean.
+
+
+
+ Task 2: Wire locomo subcommand, full QA, and add CI benchmark step
+
+ crates/memory-bench/src/main.rs,
+ crates/memory-bench/src/cli.rs,
+ .github/workflows/ci.yml
+
+
+ crates/memory-bench/src/main.rs (to wire locomo subcommand — cli.rs is now available from Plan 02),
+ crates/memory-bench/src/locomo.rs,
+ crates/memory-bench/src/cli.rs,
+ .github/workflows/ci.yml (to append benchmark job)
+
+
+1. Ensure the `Commands::Locomo` branch in main.rs is wired to the locomo module:
+```rust
+cli::Commands::Locomo { dataset, output, compare, baselines } => {
+ let conversations = locomo::load_dataset(std::path::Path::new(&dataset))?;
+ eprintln!("Loaded {} conversations from {}", conversations.len(), dataset);
+ // For each conversation:
+ // - Convert turns to JSONL, ingest via runner
+ // - Run each question through runner::run_query
+ // - Score via locomo::score_conversation and collect each LocomoResult into `results`
+ // Aggregate via locomo::aggregate_results
+ // If --compare, load baselines and include in report
+ let aggregate = locomo::aggregate_results(&results);
+ let json = serde_json::to_string_pretty(&aggregate)?;
+ println!("{json}");
+ if let Some(path) = output {
+ std::fs::write(&path, &json)?;
+ eprintln!("Results written to {path}");
+ }
+}
+```
+
+2. Add a non-blocking benchmark smoke-test job to `.github/workflows/ci.yml`.
+
+ Read the existing ci.yml first to find the correct indentation and job block structure.
+ Append the following job at the end of the `jobs:` section (same indentation as existing jobs):
+
+```yaml
+ benchmark-smoke:
+ name: Benchmark Suite Smoke Test
+ runs-on: ubuntu-latest
+ continue-on-error: true
+ needs: [test]
+ steps:
+ - uses: actions/checkout@v4
+ - uses: dtolnay/rust-toolchain@stable
+ - uses: Swatinem/rust-cache@v2
+ - name: Build memory-bench
+ run: cargo build -p memory-bench
+ - name: Smoke test (help only — no daemon required)
+ run: |
+ cargo run -p memory-bench -- --help
+ cargo run -p memory-bench -- all --help
+ cargo run -p memory-bench -- locomo --help
+```
+
+ Key requirements:
+ - `continue-on-error: true` — benchmark failures MUST NOT block CI (no daemon available in CI)
+ - `needs: [test]` — runs after the main test job (adjust job name to match existing ci.yml)
+ - Only runs `--help` checks, not actual benchmarks (avoids needing memory daemon)
+
+3. Run full workspace test suite:
+```bash
+cargo test --workspace --all-features
+```
+Ensure no regressions in other crates.
+
+4. Run pr-precheck equivalent:
+```bash
+cargo fmt --all -- --check
+cargo clippy --workspace --all-targets --all-features -- -D warnings
+cargo test --workspace --all-features
+RUSTDOCFLAGS="-D warnings" cargo doc --no-deps --workspace --all-features
+```
+
+5. Verify `locomo-data/` is gitignored:
+```bash
+mkdir -p locomo-data && git check-ignore -v locomo-data/
+```
+
+6. Fix any clippy warnings, formatting issues, or doc warnings in memory-bench crate.
+
+7. Verify `cargo run -p memory-bench -- locomo --help` shows --dataset, --output, --compare flags.
+
+
+ cd "$(git rev-parse --show-toplevel)" && cargo fmt --all -- --check && cargo clippy --workspace --all-targets --all-features -- -D warnings && cargo test -p memory-bench 2>&1 | tail -10
+
+
+ - cargo fmt --all -- --check passes
+ - cargo clippy --workspace --all-targets --all-features -- -D warnings passes
+ - cargo test -p memory-bench passes all tests
+ - cargo test --workspace --all-features passes (no regressions)
+ - cargo run -p memory-bench -- locomo --help shows --dataset flag
+ - locomo-data/ directory is not tracked by git
+ - grep -q "continue-on-error: true" .github/workflows/ci.yml
+ - grep -q "memory-bench" .github/workflows/ci.yml
+
+ Full benchmark suite passes pr-precheck. LOCOMO subcommand wired. No regressions in workspace. CI benchmark smoke test added as non-blocking job (continue-on-error: true). Custom harness runs without daemon; LOCOMO requires --dataset flag.
+
+
+
+
+
+- `cargo test -p memory-bench` passes all tests (fixture, scorer, baseline, report, locomo)
+- `cargo fmt --all -- --check` passes
+- `cargo clippy --workspace --all-targets --all-features -- -D warnings` passes
+- `cargo test --workspace --all-features` passes (no regressions)
+- `RUSTDOCFLAGS="-D warnings" cargo doc --no-deps --workspace --all-features` passes
+- `cargo run -p memory-bench -- locomo --help` shows --dataset, --output, --compare flags
+- `.github/workflows/ci.yml` contains benchmark-smoke job with `continue-on-error: true`
+
+
+
+- LOCOMO adapter parses dataset JSON into typed structs
+- score_conversation scores answers against gold with per-type breakdown
+- aggregate_results produces overall + per-type scores
+- Full pr-precheck passes for the entire workspace
+- LOCOMO subcommand wired in main.rs with --dataset flag required
+- CI can run `cargo test -p memory-bench` without LOCOMO dataset
+- CI benchmark smoke test job runs --help checks only, marked continue-on-error: true
+- locomo-data/ confirmed gitignored
+
+
+
diff --git a/.planning/phases/53-benchmark-suite/53-03-SUMMARY.md b/.planning/phases/53-benchmark-suite/53-03-SUMMARY.md
new file mode 100644
index 0000000..4e0a8cd
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-03-SUMMARY.md
@@ -0,0 +1,125 @@
+---
+phase: 53-benchmark-suite
+plan: 03
+subsystem: testing
+tags: [benchmark, locomo, serde, ci, smoke-test]
+
+requires:
+ - phase: 53-benchmark-suite (plan 01)
+ provides: "Fixture loader, TestCase structs, TOML format"
+ - phase: 53-benchmark-suite (plan 02)
+ provides: "Scorer, runner, baseline, report, CLI subcommands"
+provides:
+ - "LOCOMO dataset adapter with typed structs and scoring"
+ - "Case-insensitive substring scoring with per-type breakdown"
+ - "CI benchmark smoke test job (non-blocking)"
+affects: [benchmark-evaluation, ci-pipeline]
+
+tech-stack:
+ added: [tempfile (runtime dependency)]
+ patterns: [per-type-breakdown scoring, case-insensitive gold-answer matching, continue-on-error CI jobs]
+
+key-files:
+ created:
+ - crates/memory-bench/src/locomo.rs
+ modified:
+ - crates/memory-bench/src/lib.rs
+ - crates/memory-bench/src/main.rs
+ - crates/memory-bench/Cargo.toml
+ - .github/workflows/ci.yml
+
+key-decisions:
+ - "LOCOMO scoring uses case-insensitive substring matching (same as custom harness scorer)"
+ - "Baselines struct has named fields (memmachine, mem0) not a competitors vec"
+ - "tempfile moved from dev-dependencies to dependencies for runtime JSONL session creation"
+
+patterns-established:
+ - "LOCOMO adapter: load_dataset -> score_conversation -> aggregate_results pipeline"
+ - "CI smoke test: --help only checks, continue-on-error: true"
+
+requirements-completed: [BENCH-04, BENCH-07]
+
+duration: 5min
+completed: 2026-03-23
+---
+
+# Phase 53 Plan 03: LOCOMO Adapter & QA Summary
+
+**LOCOMO adapter with typed dataset parsing, per-type scoring (single_hop/multi_hop/temporal/open_domain), and CI benchmark smoke test**
+
+## Performance
+
+- **Duration:** 5 min
+- **Started:** 2026-03-23T02:24:03Z
+- **Completed:** 2026-03-23T02:29:01Z
+- **Tasks:** 2
+- **Files modified:** 6
+
+## Accomplishments
+- LOCOMO adapter parses Snap Research dataset JSON into typed Rust structs
+- Score conversations with case-insensitive substring matching and per-type breakdown
+- Aggregate results across multiple conversations with overall + per-type scores
+- Full pr-precheck passes (fmt, clippy, test, doc) with 24 memory-bench tests
+- CI benchmark-smoke job added with continue-on-error: true (--help only)
+
+## Task Commits
+
+Each task was committed atomically:
+
+1. **Task 1: Implement LOCOMO adapter with dataset loader and scorer** - `3fb2fa6` (feat)
+2. **Task 2: Wire locomo subcommand, full QA, and add CI benchmark step** - `90ccbfb` (feat)
+
+## Files Created/Modified
+- `crates/memory-bench/src/locomo.rs` - LOCOMO dataset loader, typed structs, scoring, aggregation, 6 tests
+- `crates/memory-bench/src/lib.rs` - Added pub mod locomo
+- `crates/memory-bench/src/main.rs` - Wired Commands::Locomo with ingestion and scoring pipeline
+- `crates/memory-bench/Cargo.toml` - Moved tempfile to runtime dependency
+- `.github/workflows/ci.yml` - Added benchmark-smoke job with continue-on-error: true
+- `crates/memory-bench/src/fixture.rs` - Formatting fixes (cargo fmt)
+- `crates/memory-bench/src/report.rs` - Formatting fixes (cargo fmt)
+
+## Decisions Made
+- LOCOMO scoring uses case-insensitive substring matching, consistent with custom harness scorer
+- tempfile moved from dev-dependencies to dependencies since locomo subcommand creates temp JSONL session files at runtime
+- Baselines struct uses named fields (memmachine, mem0) not a dynamic competitors vec -- fixed reference in locomo wiring
+
+## Deviations from Plan
+
+### Auto-fixed Issues
+
+**1. [Rule 1 - Bug] Fixed Baselines field reference**
+- **Found during:** Task 2 (Wire locomo subcommand)
+- **Issue:** Plan referenced `baselines_data.competitors.len()` but Baselines struct has named fields (memmachine, mem0)
+- **Fix:** Changed to simple log message without field count
+- **Files modified:** crates/memory-bench/src/main.rs
+- **Verification:** cargo clippy passes
+- **Committed in:** 90ccbfb (Task 2 commit)
+
+**2. [Rule 3 - Blocking] Moved tempfile to runtime dependency**
+- **Found during:** Task 2 (Wire locomo subcommand)
+- **Issue:** tempfile was dev-only but needed at runtime for JSONL session temp files
+- **Fix:** Moved from [dev-dependencies] to [dependencies] in Cargo.toml
+- **Files modified:** crates/memory-bench/Cargo.toml
+- **Verification:** cargo build succeeds
+- **Committed in:** 90ccbfb (Task 2 commit)
+
+---
+
+**Total deviations:** 2 auto-fixed (1 bug, 1 blocking)
+**Impact on plan:** Both fixes necessary for correctness. No scope creep.
+
+## Issues Encountered
+None beyond the auto-fixed deviations above.
+
+## User Setup Required
+None - no external service configuration required.
+
+## Next Phase Readiness
+- Phase 53 (benchmark-suite) complete: all 3 plans executed
+- Custom harness with TOML fixtures, scorer, baseline comparison, and LOCOMO adapter fully operational
+- CI runs benchmark smoke test on every PR (non-blocking)
+- Ready for v3.0 milestone completion
+
+---
+*Phase: 53-benchmark-suite*
+*Completed: 2026-03-23*
diff --git a/.planning/phases/53-benchmark-suite/53-CONTEXT.md b/.planning/phases/53-benchmark-suite/53-CONTEXT.md
new file mode 100644
index 0000000..2c2961c
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-CONTEXT.md
@@ -0,0 +1,107 @@
+# Phase 53: Benchmark Suite - Context
+
+**Gathered:** 2026-03-22
+**Status:** Ready for planning
+**Source:** PRD Express Path (docs/superpowers/specs/2026-03-21-v3-competitive-parity-design.md + docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md)
+
+
+## Phase Boundary
+
+This phase creates a two-part benchmark system: a custom harness for internal metrics (temporal recall, multi-session reasoning, compression efficiency) and a LOCOMO adapter for publishable, comparable scores against MemMachine and Mem0. New `crates/memory-bench/` crate with `memory-bench` binary. All reports output JSON + markdown.
+
+
+
+
+## Implementation Decisions
+
+### Architecture
+- New crate `crates/memory-bench/` with `[[bin]]` producing `memory-bench` binary
+- Custom harness loads TOML fixture files from `benchmarks/fixtures/`
+- Runner shells out to `memory` binary (Phase 52) via `std::process::Command` — NOT in-process
+- LOCOMO adapter wraps the same runner pipeline for the Snap Research dataset
+- Competitor baselines stored in `benchmarks/baselines.toml` (manually-entered, not scraped)
+
+### Sub-phase C1: Custom Harness
+- Subcommands: `temporal`, `multisession`, `compression`, `all`
+- TOML fixture format with `[[test]]` entries containing: id, description, setup (JSONL paths), query, expected_contains, max_tokens
+- Metrics: accuracy, recall@5, token_usage (avg), latency_p50/p95, compression_ratio
+- `--compare` flag reads baselines.toml and prints side-by-side table
+- `--output` flag writes results.json
+
+### Sub-phase C2: LOCOMO Adapter
+- Subcommand: `locomo --dataset=./locomo-data/ --output=results.json`
+- LOCOMO dataset (Snap Research, ~300-turn multi-session conversations, 4 question types)
+- Dataset downloaded separately via `benchmarks/scripts/download-locomo.sh`
+- `locomo-data/` in `.gitignore` — never committed
+- Adapter feeds conversations through ingestion, runs 4 question types through orchestrator, scores against gold answers
+- `--compare=memmachine` reads baselines.toml
+
+### Fixture Files
+- `benchmarks/fixtures/temporal-001.toml` — temporal recall tests
+- `benchmarks/fixtures/multisession-001.toml` — multi-session reasoning tests
+- `benchmarks/fixtures/compression-001.toml` — token compression tests
+- `benchmarks/fixtures/sessions/*.jsonl` — stub session data for ingestion
+- `benchmarks/baselines.toml` — MemMachine and Mem0 manually-entered scores
+
+### CI Integration
+- CI runs benchmark suite (non-blocking — not merge-blocking)
+- LOCOMO skipped without `--dataset` flag (flag required to activate)
+- Custom harness can run without daemon (fixture-only mode for CI)
+
+### Report Output
+- JSON report with all metrics (machine-readable)
+- Markdown report with formatted table (human-readable, publishable)
+- Both formats available for all benchmark types
+
+### Claude's Discretion
+- Whether runner needs daemon running or can operate in fixture-only stub mode for CI
+- How to handle missing JSONL session files in fixtures (skip vs fail)
+- Whether to add a `--memory-bin` flag to override binary path (useful for CI)
+- Exact LOCOMO JSON schema adaptation (varies by dataset version)
+
+
+
+
+## Canonical References
+
+**Downstream agents MUST read these before planning or implementing.**
+
+### Spec & Plans
+- `docs/superpowers/specs/2026-03-21-v3-competitive-parity-design.md` — Full v3.0 design spec (Phase C section)
+- `docs/superpowers/plans/2026-03-21-v3-phase-c-benchmark-suite.md` — Detailed implementation plan with 7 tasks, code snippets, TOML fixtures
+
+### Phase 52 CLI (Dependency)
+- `crates/memory-cli/src/main.rs` — `memory` binary that runner shells out to
+- `crates/memory-cli/src/output.rs` — `JsonEnvelope` format that runner parses
+
+### Existing Benchmark Infrastructure
+- `crates/e2e-tests/src/perf_bench.rs` — Existing perf_bench harness (v2.3, separate from this)
+
+
+
+
+## Specific Ideas
+
+- The implementation plan has 7 tasks with complete Rust code snippets
+- Fixture TOML format is fully specified with 3 sample fixtures
+- Runner uses `std::process::Command` to shell out to `memory search --format=json`
+- Scorer does case-insensitive substring matching against `expected_contains`
+- Report generator produces both JSON and markdown table formats
+- LOCOMO adapter has typed structs for the Snap Research JSON format
+- Baseline comparison reads `benchmarks/baselines.toml` and formats side-by-side
+
+
+
+
+## Deferred Ideas
+
+- Continuous benchmark regression tracking in CI (BENCH-F01) — future milestone
+- Automated dataset refresh/download — manual for now
+- Side quest: positioning writeup (`docs/positioning/agent-memory-vs-competition.md`) — not a GSD phase, done alongside or after Phase C
+
+
+
+---
+
+*Phase: 53-benchmark-suite*
+*Context gathered: 2026-03-22 via PRD Express Path*
diff --git a/.planning/phases/53-benchmark-suite/53-VERIFICATION.md b/.planning/phases/53-benchmark-suite/53-VERIFICATION.md
new file mode 100644
index 0000000..e43632b
--- /dev/null
+++ b/.planning/phases/53-benchmark-suite/53-VERIFICATION.md
@@ -0,0 +1,129 @@
+---
+phase: 53-benchmark-suite
+verified: 2026-03-22T00:00:00Z
+status: passed
+score: 17/17 must-haves verified
+re_verification: false
+gaps: []
+human_verification:
+ - test: "Run memory-bench all against a live memory daemon"
+ expected: "Fixture ingestion, query, scoring, and markdown report output with real latency values"
+ why_human: "Requires running memory daemon; CI smoke test is --help only"
+ - test: "Run memory-bench locomo --dataset=./locomo-data/ after downloading the LOCOMO dataset"
+ expected: "Conversations ingested, questions scored, aggregate JSON printed with overall_score and per-type breakdown"
+ why_human: "LOCOMO dataset not committed; requires manual download via download-locomo.sh"
+---
+
+# Phase 53: Benchmark Suite Verification Report
+
+**Phase Goal:** Users can measure and compare Agent Memory retrieval quality with reproducible benchmarks and a publishable LOCOMO score
+**Verified:** 2026-03-22
+**Status:** PASSED
+**Re-verification:** No — initial verification
+
+## Goal Achievement
+
+### Observable Truths
+
+| # | Truth | Status | Evidence |
+|---|-------|--------|----------|
+| 1 | memory-bench crate compiles as part of the workspace | VERIFIED | `cargo build -p memory-bench` succeeds; "crates/memory-bench" in root Cargo.toml |
+| 2 | TOML fixture files parse into typed Rust structs | VERIFIED | `fixture.rs` defines `Fixture`/`TestCase`; 4 passing tests including `test_fixture_parses_valid_toml` |
+| 3 | Fixture loader validates required fields and rejects invalid fixtures | VERIFIED | `Fixture::load` bails on empty id or empty query; tests `test_fixture_validates_empty_id` and `test_fixture_validates_empty_query` pass |
+| 4 | locomo-data/ is gitignored | VERIFIED | `locomo-data/` entry confirmed in `.gitignore` |
+| 5 | Baseline competitor scores are stored in benchmarks/baselines.toml | VERIFIED | `[memmachine]` with `locomo_score = 0.91` and `[mem0]` with scores present |
+| 6 | Runner shells out to memory binary and captures JSON output + latency | VERIFIED | `runner.rs` uses `Command::new(&config.memory_bin).args(["search", query, "--format=json"])`, captures elapsed time, parses `meta.tokens_estimated` |
+| 7 | Scorer computes accuracy and recall@5 from expected_contains matching | VERIFIED | `scorer.rs` exports `score_result`, `compute_accuracy`, `compute_recall_at_k`; 11 passing unit tests |
+| 8 | Scorer computes compression_ratio from context_tokens vs raw_tokens | VERIFIED | `compute_compression_ratio(context_tokens, raw_tokens)` and `estimate_raw_tokens` implemented; tests pass |
+| 9 | Report generates both JSON and markdown table formats | VERIFIED | `report.rs` exports `to_json` and `to_markdown`; round-trip test and header test pass |
+| 10 | Baseline loader reads benchmarks/baselines.toml into typed structs | VERIFIED | `baseline.rs` defines `Baselines`/`CompetitorScore`; `test_baselines_load` asserts `memmachine.locomo_score = Some(0.91)` |
+| 11 | CLI exposes temporal, multisession, compression, all, and locomo subcommands | VERIFIED | `cargo run -p memory-bench -- --help` shows all 5 subcommands; clap definitions in `cli.rs` |
+| 12 | --compare flag reads baselines and prints side-by-side table | VERIFIED | `all` and `locomo` subcommands both have `--compare` flag; `to_markdown` generates 4-column table when `baselines` is `Some` |
+| 13 | LOCOMO JSON dataset files parse into typed Rust structs | VERIFIED | `locomo.rs` defines `LocomoConversation`, `Turn`, `Question`; `test_locomo_conversation_parses` and `test_locomo_conversation_multiple_types` pass |
+| 14 | LOCOMO adapter loads conversations with 4 question types | VERIFIED | `question_type` field supports single_hop, multi_hop, temporal, open_domain; test verifies all 4 types parse |
+| 15 | LOCOMO results include per-type scores and aggregate score | VERIFIED | `LocomoResult.by_type` HashMap + `LocomoAggregateResult.by_type`; `test_aggregate_results` verifies per-type sums |
+| 16 | CI can run custom harness without LOCOMO (locomo subcommand requires --dataset flag) | VERIFIED | `--dataset` is required arg (no default); `--help` checks only in CI smoke test |
+| 17 | CI runs benchmark smoke test as non-blocking step (continue-on-error: true) | VERIFIED | `benchmark-smoke` job at line 183 of ci.yml with `continue-on-error: true` and `needs: [test]` |
+
+**Score:** 17/17 truths verified
+
+### Required Artifacts
+
+| Artifact | Expected | Status | Details |
+|----------|----------|--------|---------|
+| `crates/memory-bench/Cargo.toml` | Crate manifest with workspace deps | VERIFIED | Contains "memory-bench"; clap, tokio, serde, toml, thiserror, anyhow, tracing wired |
+| `crates/memory-bench/src/fixture.rs` | TOML fixture loader with validation | VERIFIED | `Fixture`, `TestCase` exported; `load` validates id/query; `load_dir` sorts entries |
+| `crates/memory-bench/src/scorer.rs` | Result scoring functions + BenchmarkReport | VERIFIED | `score_result`, `compute_accuracy`, `compute_recall_at_k`, `percentile`, `compute_compression_ratio`, `estimate_raw_tokens`, `BenchmarkReport` all present |
+| `crates/memory-bench/src/runner.rs` | Shell-out runner for memory binary | VERIFIED | `run_query` uses `std::process::Command`; `ingest_session` reads JSONL; `QueryResult` captures latency + tokens |
+| `crates/memory-bench/src/report.rs` | JSON + markdown report generation | VERIFIED | `to_json` round-trips; `to_markdown` generates single-column or 4-column comparison table |
+| `crates/memory-bench/src/baseline.rs` | Competitor baseline TOML loader | VERIFIED | `Baselines`, `CompetitorScore` defined; `load` reads TOML from path |
+| `crates/memory-bench/src/cli.rs` | Clap CLI definition with all subcommands | VERIFIED | `Cli`, `Commands` with Temporal/Multisession/Compression/All/Locomo variants |
+| `crates/memory-bench/src/locomo.rs` | LOCOMO dataset loader and scorer | VERIFIED | `LocomoConversation`, `Turn`, `Question`, `LocomoResult`, `LocomoAggregateResult`; `load_dataset`, `score_conversation`, `aggregate_results` all implemented with 6 tests |
+| `crates/memory-bench/src/main.rs` | Full pipeline wiring | VERIFIED | All 5 CLI commands dispatch to correct modules; `run_category`, `run_all`, `run_tests` implement the scoring pipeline end-to-end |
+| `crates/memory-bench/src/lib.rs` | Module declarations | VERIFIED | Exports `baseline`, `fixture`, `locomo`, `report`, `runner`, `scorer` |
+| `benchmarks/fixtures/temporal-001.toml` | Temporal recall test fixtures | VERIFIED | Contains `[[test]]` with id temporal-001 and temporal-002 |
+| `benchmarks/fixtures/multisession-001.toml` | Multi-session test fixtures | VERIFIED | Contains `[[test]]` with id multi-001 |
+| `benchmarks/fixtures/compression-001.toml` | Compression test fixtures | VERIFIED | Contains `[[test]]` with id compress-001 |
+| `benchmarks/fixtures/sessions/*.jsonl` | 7 JSONL session stub files | VERIFIED | auth-decision (6 lines), bug-fix, follow-up, session-a/b/c, long-session (30 lines) all present |
+| `benchmarks/baselines.toml` | Competitor baseline scores | VERIFIED | `[memmachine]` locomo_score=0.91; `[mem0]` accuracy_vs_openai_memory=0.26 |
+| `benchmarks/scripts/download-locomo.sh` | LOCOMO download script | VERIFIED | Executable (`chmod +x`); curl + unzip pipeline present |
+| `.github/workflows/ci.yml` | Non-blocking benchmark CI step | VERIFIED | `benchmark-smoke` job with `continue-on-error: true`, `needs: [test]`, runs --help only |
+
+### Key Link Verification
+
+| From | To | Via | Status | Details |
+|------|----|-----|--------|---------|
+| `fixture.rs` | `benchmarks/fixtures/*.toml` | `toml::from_str` in `Fixture::load` | WIRED | `Fixture::load` and `Fixture::load_dir` both call `toml::from_str`; tests use temp fixtures |
+| `Cargo.toml` (root) | `crates/memory-bench` | workspace members | WIRED | `"crates/memory-bench"` present in members array |
+| `runner.rs` | memory binary | `Command::new(&config.memory_bin).args(["search", ...])` | WIRED | Line 34-36 of runner.rs; graceful fallback when binary not found |
+| `scorer.rs` | `runner.rs` | `score_result` called on `result.raw_output` | WIRED | `main.rs` line 144: `scorer::score_result(&result.raw_output, &test.expected_contains)` |
+| `scorer.rs` | fixture setup paths | `compute_compression_ratio` uses `estimate_raw_tokens(&test.setup)` | WIRED | `main.rs` line 150: `scorer::estimate_raw_tokens(&test.setup)` |
+| `report.rs` | `scorer.rs` | `BenchmarkReport` struct passed to `to_json`/`to_markdown` | WIRED | `report.rs` imports `crate::scorer::BenchmarkReport` |
+| `main.rs` | `cli.rs` | Clap Parser dispatch via `Commands::` | WIRED | `main.rs` line 2-3: `mod cli; use cli::Cli; cli::Cli::parse()` |
+| `locomo.rs` | JSON dataset files | `serde_json::from_str` in `load_dataset` | WIRED | `locomo.rs` line 97; `test_load_dataset_from_dir` confirms with temp dir |
+| `main.rs` | `locomo.rs` | `Commands::Locomo` dispatch | WIRED | `main.rs` line 53: `locomo::load_dataset(...)`, `locomo::score_conversation(...)`, `locomo::aggregate_results(...)` |
+| `ci.yml` | `cargo run -p memory-bench` | benchmark-smoke job | WIRED | Lines 204-211 of ci.yml: build + help checks for --help, all --help, locomo --help |
+
+### Requirements Coverage
+
+| Requirement | Source Plan | Description | Status | Evidence |
+|-------------|------------|-------------|--------|----------|
+| BENCH-01 | 53-01-PLAN.md | Custom benchmark harness with TOML fixture files (temporal, multisession, compression) | SATISFIED | `fixture.rs` + 3 `.toml` fixture files + 7 session stubs verified on disk |
+| BENCH-02 | 53-02-PLAN.md | `memory benchmark temporal/multisession/compression/all` subcommands | SATISFIED | CLI exposes all subcommands; `cargo run -p memory-bench -- --help` confirms all present |
+| BENCH-03 | 53-02-PLAN.md | Benchmark reports accuracy, recall@5, token_usage, latency_p50/p95, compression ratio | SATISFIED | `BenchmarkReport` struct and all 6 metric fields verified in `scorer.rs` |
+| BENCH-04 | 53-03-PLAN.md | LOCOMO adapter ingests Snap Research dataset and produces results.json with aggregate score | SATISFIED | `locomo.rs` with `load_dataset`, `score_conversation`, `aggregate_results`; JSON output in main.rs |
+| BENCH-05 | 53-02-PLAN.md | --compare flag reads benchmarks/baselines.toml and prints side-by-side competitor table | SATISFIED | `--compare` flag on `all` and `locomo` subcommands; `to_markdown` generates 4-column table |
+| BENCH-06 | 53-01-PLAN.md | locomo-data/ in .gitignore — dataset never committed | SATISFIED | `locomo-data/` confirmed in `.gitignore` |
+| BENCH-07 | 53-03-PLAN.md | CI runs benchmark suite (non-blocking, skips LOCOMO without --dataset flag) | SATISFIED | `benchmark-smoke` CI job with `continue-on-error: true`; --dataset is required arg with no default |
+| BENCH-08 | 53-02-PLAN.md | JSON + markdown report output for all benchmark types | SATISFIED | `to_json` and `to_markdown` in `report.rs`; both invoked from main.rs for all subcommands |
+
+All 8 requirements mapped. No orphaned requirements detected.
+
+### Anti-Patterns Found
+
+No anti-patterns detected in any memory-bench source file. No TODO/FIXME/placeholder comments, no empty return stubs, no console-only handlers.
+
+### Human Verification Required
+
+#### 1. Live Benchmark Run Against Memory Daemon
+
+**Test:** Start memory daemon, then run `cargo run -p memory-bench -- all --fixtures benchmarks/fixtures`
+**Expected:** Fixture JSONL sessions ingest via `memory add`, queries execute via `memory search --format=json`, BenchmarkReport printed as markdown table with real latency/token values
+**Why human:** Requires running memory daemon; CI smoke test only verifies --help output, not actual benchmark execution
+
+#### 2. LOCOMO Dataset Benchmark
+
+**Test:** Download LOCOMO dataset via `benchmarks/scripts/download-locomo.sh`, then run `cargo run -p memory-bench -- locomo --dataset=./locomo-data/ --compare`
+**Expected:** Conversations loaded and ingested, questions scored against memory search results, aggregate JSON with `overall_score` and per-type breakdown (single_hop, multi_hop, temporal, open_domain) printed; comparison table shows Agent-Memory vs MemMachine vs Mem0
+**Why human:** LOCOMO dataset is gitignored and requires separate download; publishable score depends on real retrieval quality
+
+### Gaps Summary
+
+No gaps. All 17 must-have truths verified. All 8 requirement IDs (BENCH-01 through BENCH-08) are satisfied with concrete implementation evidence. All key links are wired. 24 unit tests pass. Clippy passes with no warnings.
+
+The two human verification items are not blockers — they require external runtime dependencies (memory daemon, LOCOMO dataset) that cannot be verified programmatically.
+
+---
+
+_Verified: 2026-03-22_
+_Verifier: Claude (gsd-verifier)_
diff --git a/Cargo.toml b/Cargo.toml
index 535765a..45bd645 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ members = [
"crates/memory-installer",
"crates/memory-orchestrator",
"crates/memory-cli",
+ "crates/memory-bench",
]
[workspace.package]
diff --git a/benchmarks/baselines.toml b/benchmarks/baselines.toml
new file mode 100644
index 0000000..4d16044
--- /dev/null
+++ b/benchmarks/baselines.toml
@@ -0,0 +1,14 @@
+# Manually-maintained competitor benchmark scores.
+# Sources listed per entry.
+
+[memmachine]
+# Source: https://memmachine.ai/blog/2025/12/memmachine-v0.2-delivers-top-scores-and-efficiency-on-locomo-benchmark/
+locomo_score = 0.91
+token_reduction = 0.80
+latency_improvement = 0.75
+
+[mem0]
+# Source: https://mem0.ai/research
+accuracy_vs_openai_memory = 0.26
+token_reduction = 0.90
+latency_reduction = 0.91
diff --git a/benchmarks/fixtures/compression-001.toml b/benchmarks/fixtures/compression-001.toml
new file mode 100644
index 0000000..c4313e1
--- /dev/null
+++ b/benchmarks/fixtures/compression-001.toml
@@ -0,0 +1,7 @@
+[[test]]
+id = "compress-001"
+description = "Verify context is compressed vs raw session dump"
+setup = ["sessions/long-session.jsonl"]
+query = "summarize the key decisions from this project"
+expected_contains = ["decision", "architecture"]
+max_tokens = 800
diff --git a/benchmarks/fixtures/multisession-001.toml b/benchmarks/fixtures/multisession-001.toml
new file mode 100644
index 0000000..bfe377f
--- /dev/null
+++ b/benchmarks/fixtures/multisession-001.toml
@@ -0,0 +1,7 @@
+[[test]]
+id = "multi-001"
+description = "Connect a decision from session A with an outcome from session B"
+setup = ["sessions/session-a.jsonl", "sessions/session-b.jsonl", "sessions/session-c.jsonl"]
+query = "what was the outcome of the approach we chose last week?"
+expected_contains = ["performance", "latency"]
+max_tokens = 600
diff --git a/benchmarks/fixtures/sessions/auth-decision.jsonl b/benchmarks/fixtures/sessions/auth-decision.jsonl
new file mode 100644
index 0000000..ba4ce05
--- /dev/null
+++ b/benchmarks/fixtures/sessions/auth-decision.jsonl
@@ -0,0 +1,6 @@
+{"role":"user","content":"We need to decide on an authentication approach for the API. What are our options?"}
+{"role":"assistant","content":"We have three main options: 1) JWT tokens with refresh rotation, 2) OAuth2 with a third-party provider, 3) Session-based auth with server-side storage. Given our microservices architecture, JWT is the most common choice."}
+{"role":"user","content":"JWT sounds right. What about token expiry and refresh?"}
+{"role":"assistant","content":"I recommend short-lived access tokens (15 minutes) with longer refresh tokens (7 days). We should implement refresh token rotation where each refresh invalidates the old token and issues a new pair."}
+{"role":"user","content":"Let's go with JWT and refresh token rotation. Can you outline the implementation?"}
+{"role":"assistant","content":"Here's the plan: 1) Use the jose library for JWT signing/verification, 2) Store refresh tokens in the database with a family ID for rotation detection, 3) Access tokens carry user_id and roles in claims, 4) Middleware validates on every request. I'll start with the token service."}
diff --git a/benchmarks/fixtures/sessions/bug-fix.jsonl b/benchmarks/fixtures/sessions/bug-fix.jsonl
new file mode 100644
index 0000000..f57e9d0
--- /dev/null
+++ b/benchmarks/fixtures/sessions/bug-fix.jsonl
@@ -0,0 +1,6 @@
+{"role":"user","content":"We're getting a null pointer exception in the user profile handler. It crashes when the user has no avatar set."}
+{"role":"assistant","content":"I see the issue. The `get_avatar_url()` method returns a raw pointer without checking for null. In Rust, we should use Option instead. Let me look at the handler code."}
+{"role":"user","content":"Yes, the avatar field was added later and some users don't have it. How should we fix it?"}
+{"role":"assistant","content":"We need to: 1) Change the avatar field type from String to Option<String>, 2) Add a null check in the handler using .unwrap_or_default(), 3) Update the database migration to allow NULL in the avatar column. The Option type will prevent this class of bug entirely."}
+{"role":"user","content":"That makes sense. Let's also add a default avatar URL for users without one."}
+{"role":"assistant","content":"Good idea. I'll add a DEFAULT_AVATAR_URL constant and use it in the unwrap_or. The handler will now be: `let avatar = user.avatar.unwrap_or_else(|| DEFAULT_AVATAR_URL.to_string())`. This ensures every response includes a valid avatar URL."}
diff --git a/benchmarks/fixtures/sessions/follow-up.jsonl b/benchmarks/fixtures/sessions/follow-up.jsonl
new file mode 100644
index 0000000..0d587b4
--- /dev/null
+++ b/benchmarks/fixtures/sessions/follow-up.jsonl
@@ -0,0 +1,4 @@
+{"role":"user","content":"Did the null pointer fix for avatars get deployed? Any issues?"}
+{"role":"assistant","content":"Yes, the fix was deployed yesterday. The Option change and default avatar URL are working correctly. We haven't seen any null pointer crashes since the deploy. The migration ran cleanly on all environments."}
+{"role":"user","content":"Great. Let's make sure we add similar null checks for the other optional profile fields."}
+{"role":"assistant","content":"I'll audit all profile fields. The bio, website, and location fields also need the Option treatment. I'll create a follow-up PR for those."}
diff --git a/benchmarks/fixtures/sessions/long-session.jsonl b/benchmarks/fixtures/sessions/long-session.jsonl
new file mode 100644
index 0000000..ff14262
--- /dev/null
+++ b/benchmarks/fixtures/sessions/long-session.jsonl
@@ -0,0 +1,30 @@
+{"role":"user","content":"Let's plan the architecture for our new notification service. What are the key components?"}
+{"role":"assistant","content":"The notification service needs: 1) An event ingestion API, 2) A routing engine that determines which channels to use (email, push, SMS), 3) A template system for message formatting, 4) Delivery adapters for each channel, 5) A preference store for user notification settings, 6) A retry queue for failed deliveries."}
+{"role":"user","content":"Should we use a message queue for async processing?"}
+{"role":"assistant","content":"Yes, definitely. I recommend RabbitMQ for this. It gives us reliable message delivery, dead letter queues for failed messages, and priority queues for urgent notifications. We can have separate queues per channel."}
+{"role":"user","content":"What about the template system? We need to support multiple languages."}
+{"role":"assistant","content":"I suggest using Handlebars templates stored in the database. Each template has a key, locale, and channel combination. We can fall back to the default locale if a translation is missing. The template service loads and caches them on startup."}
+{"role":"user","content":"How do we handle rate limiting for notifications?"}
+{"role":"assistant","content":"We need per-user rate limits to avoid spamming. I propose: max 10 push notifications per hour, max 5 emails per hour, max 2 SMS per day. We can store counters in Redis with TTL-based windows. Critical notifications (security alerts) bypass rate limits."}
+{"role":"user","content":"What about notification grouping? We don't want 50 individual notifications."}
+{"role":"assistant","content":"Good point. We should implement digest mode: collect notifications of the same type over a configurable window (e.g., 5 minutes), then send a single grouped notification. The grouping key would be notification_type + user_id."}
+{"role":"user","content":"How should we handle delivery failures?"}
+{"role":"assistant","content":"Three-tier retry strategy: 1) Immediate retry for transient errors (network timeout), 2) Exponential backoff for provider errors (1min, 5min, 30min), 3) Dead letter queue after 3 failures. We should also implement circuit breakers per delivery channel to avoid cascading failures."}
+{"role":"user","content":"Let's discuss the database schema for notification history."}
+{"role":"assistant","content":"I propose these tables: notifications (id, user_id, type, channel, status, created_at), notification_content (notification_id, subject, body, metadata), delivery_attempts (notification_id, attempt_number, status, error_message, attempted_at). We should partition the notifications table by created_at for efficient cleanup."}
+{"role":"user","content":"What about real-time notifications via WebSocket?"}
+{"role":"assistant","content":"We can add a WebSocket gateway that subscribes to a Redis pub/sub channel. When a notification is created, we publish to user:{user_id}:notifications. Connected clients receive it instantly. For offline users, the notification is stored and delivered on next connection via a catch-up query."}
+{"role":"user","content":"How do we test all of this?"}
+{"role":"assistant","content":"Testing strategy: 1) Unit tests for routing logic and template rendering, 2) Integration tests with test doubles for delivery adapters, 3) Contract tests for the API, 4) Load tests simulating 10K notifications/minute, 5) Chaos tests that kill delivery adapters to verify retry behavior."}
+{"role":"user","content":"What's the deployment strategy?"}
+{"role":"assistant","content":"I recommend deploying as three separate services: 1) Notification API (handles ingestion and preferences), 2) Notification Worker (processes queues, renders templates, delivers), 3) WebSocket Gateway (handles real-time connections). This lets us scale workers independently based on queue depth."}
+{"role":"user","content":"Should we add analytics for notification effectiveness?"}
+{"role":"assistant","content":"Yes. Track: delivery rate, open rate (for emails), click-through rate, unsubscribe rate, and time-to-read. Store events in a separate analytics table. We can use this data to optimize send times and channel selection per user."}
+{"role":"user","content":"What about A/B testing for notification content?"}
+{"role":"assistant","content":"We can extend the template system with variants. Each template can have A/B variants with a traffic split percentage. The routing engine assigns users to variants deterministically using a hash of user_id + experiment_id. Results feed into the analytics pipeline."}
+{"role":"user","content":"How do we handle GDPR compliance?"}
+{"role":"assistant","content":"Key requirements: 1) Honor unsubscribe requests within 24 hours, 2) Provide notification history export in user data requests, 3) Delete notification history on account deletion, 4) Store consent records with timestamps, 5) Never send marketing notifications without explicit opt-in. The preference store should track consent per notification category."}
+{"role":"user","content":"Let's summarize the key architecture decisions we've made."}
+{"role":"assistant","content":"Summary of architecture decisions: 1) RabbitMQ for async message processing with per-channel queues, 2) Handlebars templates with i18n support stored in database, 3) Redis-based rate limiting with per-user counters, 4) Digest mode with configurable grouping windows, 5) Three-tier retry with circuit breakers, 6) Partitioned notification history tables, 7) WebSocket gateway with Redis pub/sub for real-time, 8) Three-service deployment (API, Worker, Gateway), 9) Analytics pipeline for notification effectiveness, 10) A/B testing via template variants."}
+{"role":"user","content":"One more thing - what monitoring should we set up?"}
+{"role":"assistant","content":"Essential monitors: 1) Queue depth per channel (alert if > 1000), 2) Delivery success rate (alert if < 95%), 3) P99 delivery latency (alert if > 30s), 4) Dead letter queue size (alert if > 0), 5) WebSocket connection count, 6) Rate limit hit rate per user, 7) Template render errors. Use Prometheus metrics with Grafana dashboards."}
diff --git a/benchmarks/fixtures/sessions/session-a.jsonl b/benchmarks/fixtures/sessions/session-a.jsonl
new file mode 100644
index 0000000..baa27fb
--- /dev/null
+++ b/benchmarks/fixtures/sessions/session-a.jsonl
@@ -0,0 +1,6 @@
+{"role":"user","content":"We need to choose between Redis and PostgreSQL for our caching layer. What do you think?"}
+{"role":"assistant","content":"For a caching layer, Redis is the standard choice. It offers sub-millisecond reads, built-in TTL support, and pub/sub for cache invalidation. PostgreSQL could work with UNLOGGED tables but adds complexity."}
+{"role":"user","content":"What about consistency guarantees?"}
+{"role":"assistant","content":"Redis is eventually consistent in cluster mode, but for caching that's usually fine since we treat cache misses as normal. If you need strong consistency, you'd still read from PostgreSQL as the source of truth."}
+{"role":"user","content":"Let's go with Redis for caching. We'll keep PostgreSQL as the primary store."}
+{"role":"assistant","content":"Agreed. I'll set up Redis with a 15-minute default TTL and implement cache-aside pattern: read from Redis first, fall back to PostgreSQL, then populate the cache. We'll add cache invalidation on writes."}
diff --git a/benchmarks/fixtures/sessions/session-b.jsonl b/benchmarks/fixtures/sessions/session-b.jsonl
new file mode 100644
index 0000000..74413a2
--- /dev/null
+++ b/benchmarks/fixtures/sessions/session-b.jsonl
@@ -0,0 +1,6 @@
+{"role":"user","content":"How is the Redis caching performing in production?"}
+{"role":"assistant","content":"The results are excellent. We're seeing 95% cache hit rates on read-heavy endpoints. Average latency dropped from 45ms to 3ms for cached responses. The cache-aside pattern is working well."}
+{"role":"user","content":"Any issues with cache invalidation?"}
+{"role":"assistant","content":"We had one issue where batch updates weren't invalidating properly. Fixed it by adding a bulk invalidation method that uses Redis SCAN instead of individual DEL commands. Performance is now consistent."}
+{"role":"user","content":"What about memory usage?"}
+{"role":"assistant","content":"Redis is using about 2GB for our dataset, well within the 8GB instance. The 15-minute TTL keeps memory stable. Peak usage during high traffic only reaches 3.5GB. No eviction pressure."}
diff --git a/benchmarks/fixtures/sessions/session-c.jsonl b/benchmarks/fixtures/sessions/session-c.jsonl
new file mode 100644
index 0000000..7185fe6
--- /dev/null
+++ b/benchmarks/fixtures/sessions/session-c.jsonl
@@ -0,0 +1,2 @@
+{"role":"user","content":"Looking back at our Redis caching decision, was it the right call? The latency improvements seem significant."}
+{"role":"assistant","content":"Absolutely. The decision to use Redis for caching has paid off well. The 95% hit rate and 14x latency improvement (45ms to 3ms) validate the choice. The cache-aside pattern kept the implementation simple, and the 2GB memory footprint is very manageable. If we had gone with PostgreSQL UNLOGGED tables, we'd have seen maybe 2-3x improvement at best."}
diff --git a/benchmarks/fixtures/temporal-001.toml b/benchmarks/fixtures/temporal-001.toml
new file mode 100644
index 0000000..978c164
--- /dev/null
+++ b/benchmarks/fixtures/temporal-001.toml
@@ -0,0 +1,15 @@
+[[test]]
+id = "temporal-001"
+description = "Recall an architectural decision made in a prior session"
+setup = ["sessions/auth-decision.jsonl"]
+query = "what authentication approach did we decide on?"
+expected_contains = ["JWT", "token"]
+max_tokens = 500
+
+[[test]]
+id = "temporal-002"
+description = "Recall a specific bug fix from two sessions ago"
+setup = ["sessions/bug-fix.jsonl", "sessions/follow-up.jsonl"]
+query = "how did we fix the null pointer exception?"
+expected_contains = ["null check", "Option"]
+max_tokens = 400
diff --git a/benchmarks/scripts/download-locomo.sh b/benchmarks/scripts/download-locomo.sh
new file mode 100755
index 0000000..f788983
--- /dev/null
+++ b/benchmarks/scripts/download-locomo.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Download the LOCOMO dataset archive and unpack it.
+# Usage: download-locomo.sh [DEST]   (DEST defaults to "locomo-data")
+set -euo pipefail
+DEST="${1:-locomo-data}"
+mkdir -p "$DEST"
+echo "Downloading LOCOMO dataset to $DEST ..."
+# --fail: exit non-zero on an HTTP error (e.g. 404) instead of saving the error page
+# as locomo_v1.zip, so `set -e` stops here rather than at a confusing unzip failure.
+curl --fail -L "https://snap-research.github.io/locomo/data/locomo_v1.zip" -o "$DEST/locomo_v1.zip"
+unzip -q "$DEST/locomo_v1.zip" -d "$DEST"
+echo "Done. Dataset at: $DEST"
+echo "NOTE: Verify license terms at https://snap-research.github.io/locomo/ before publishing scores."
diff --git a/crates/memory-bench/Cargo.toml b/crates/memory-bench/Cargo.toml
new file mode 100644
index 0000000..effb861
--- /dev/null
+++ b/crates/memory-bench/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "memory-bench"
+version.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[[bin]]
+name = "memory-bench"
+path = "src/main.rs"
+
+[dependencies]
+clap = { workspace = true }
+tokio = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+toml = { workspace = true }
+thiserror = { workspace = true }
+anyhow = { workspace = true }
+tracing = { workspace = true }
+
+[dependencies.tempfile]
+workspace = true
+
+[dev-dependencies]
diff --git a/crates/memory-bench/src/baseline.rs b/crates/memory-bench/src/baseline.rs
new file mode 100644
index 0000000..4986dca
--- /dev/null
+++ b/crates/memory-bench/src/baseline.rs
@@ -0,0 +1,63 @@
+use anyhow::Result;
+use serde::Deserialize;
+use std::path::Path;
+
+/// Competitor baselines parsed from TOML; the `[memmachine]` and `[mem0]` tables are optional.
+#[derive(Debug, Deserialize)]
+pub struct Baselines {
+    pub memmachine: Option<CompetitorScore>,
+    pub mem0: Option<CompetitorScore>,
+}
+
+/// Scores for a single competitor; every metric is optional because competitors publish different ones.
+#[derive(Debug, Deserialize)]
+pub struct CompetitorScore {
+    pub locomo_score: Option<f64>,
+    pub token_reduction: Option<f64>,
+    pub latency_improvement: Option<f64>,
+    pub accuracy_vs_openai_memory: Option<f64>,
+    pub latency_reduction: Option<f64>,
+}
+
+impl Baselines {
+    /// Load competitor baselines from a TOML file; fails if it cannot be read or parsed.
+    pub fn load(path: &Path) -> Result<Self> {
+        let content = std::fs::read_to_string(path)?;
+        Ok(toml::from_str(&content)?)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_baselines_load() {
+        let toml_content = r#"
+[memmachine]
+locomo_score = 0.91
+token_reduction = 0.80
+latency_improvement = 0.75
+
+[mem0]
+accuracy_vs_openai_memory = 0.26
+token_reduction = 0.90
+latency_reduction = 0.91
+"#;
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("baselines.toml");
+        std::fs::write(&path, toml_content).unwrap();
+
+        let baselines = Baselines::load(&path).unwrap();
+
+        let mm = baselines.memmachine.unwrap();
+        assert_eq!(mm.locomo_score, Some(0.91));
+        assert_eq!(mm.token_reduction, Some(0.80));
+        assert_eq!(mm.latency_improvement, Some(0.75));
+
+        let m0 = baselines.mem0.unwrap();
+        assert_eq!(m0.accuracy_vs_openai_memory, Some(0.26));
+        assert_eq!(m0.token_reduction, Some(0.90));
+        assert_eq!(m0.latency_reduction, Some(0.91));
+    }
+}
diff --git a/crates/memory-bench/src/cli.rs b/crates/memory-bench/src/cli.rs
new file mode 100644
index 0000000..e56de27
--- /dev/null
+++ b/crates/memory-bench/src/cli.rs
@@ -0,0 +1,75 @@
+use clap::{Parser, Subcommand};
+
+/// Benchmark suite for Agent Memory.
+#[derive(Parser)]
+#[command(name = "memory-bench", about = "Benchmark suite for Agent Memory")]
+pub struct Cli {
+    #[command(subcommand)]
+    pub command: Commands,
+
+    /// Path to memory binary (default: searches PATH).
+    #[arg(long, global = true, default_value = "memory")]
+    pub memory_bin: String,
+}
+
+/// Available benchmark subcommands.
+#[derive(Subcommand)]
+pub enum Commands {
+    /// Run temporal recall benchmarks.
+    Temporal {
+        /// Path to fixtures directory.
+        #[arg(long, default_value = "benchmarks/fixtures")]
+        fixtures: String,
+        /// Output file for JSON results.
+        #[arg(long)]
+        output: Option<String>,
+    },
+    /// Run multi-session reasoning benchmarks.
+    Multisession {
+        /// Path to fixtures directory.
+        #[arg(long, default_value = "benchmarks/fixtures")]
+        fixtures: String,
+        /// Output file for JSON results.
+        #[arg(long)]
+        output: Option<String>,
+    },
+    /// Run compression efficiency benchmarks.
+    Compression {
+        /// Path to fixtures directory.
+        #[arg(long, default_value = "benchmarks/fixtures")]
+        fixtures: String,
+        /// Output file for JSON results.
+        #[arg(long)]
+        output: Option<String>,
+    },
+    /// Run full custom benchmark suite (all categories).
+    All {
+        /// Path to fixtures directory.
+        #[arg(long, default_value = "benchmarks/fixtures")]
+        fixtures: String,
+        /// Output file for JSON results.
+        #[arg(long)]
+        output: Option<String>,
+        /// Compare against competitor baselines.
+        #[arg(long)]
+        compare: bool,
+        /// Path to baselines TOML file.
+        #[arg(long, default_value = "benchmarks/baselines.toml")]
+        baselines: String,
+    },
+    /// Run LOCOMO adapter benchmark.
+    Locomo {
+        /// Path to LOCOMO dataset directory.
+        #[arg(long)]
+        dataset: String,
+        /// Output file for JSON results.
+        #[arg(long)]
+        output: Option<String>,
+        /// Compare against competitor baselines.
+        #[arg(long)]
+        compare: bool,
+        /// Path to baselines TOML file.
+        #[arg(long, default_value = "benchmarks/baselines.toml")]
+        baselines: String,
+    },
+}
diff --git a/crates/memory-bench/src/fixture.rs b/crates/memory-bench/src/fixture.rs
new file mode 100644
index 0000000..5913daf
--- /dev/null
+++ b/crates/memory-bench/src/fixture.rs
@@ -0,0 +1,216 @@
+use anyhow::{bail, Result};
+use serde::{Deserialize, Serialize};
+use std::path::Path;
+
+/// A collection of test cases loaded from a TOML fixture file (one `[[test]]` table per case).
+#[derive(Debug, Deserialize, Serialize)]
+pub struct Fixture {
+    #[serde(rename = "test")]
+    pub tests: Vec<TestCase>,
+}
+
+/// A single benchmark test case.
+#[derive(Debug, Deserialize, Serialize, Clone)]
+pub struct TestCase {
+    /// Unique identifier for this test case.
+    pub id: String,
+    /// Human-readable description of what the test verifies.
+    pub description: String,
+    /// Paths to JSONL session files to ingest before running the query.
+    pub setup: Vec<String>,
+    /// The query to run against the memory system.
+    pub query: String,
+    /// Case-insensitive substrings that the response should contain.
+    pub expected_contains: Vec<String>,
+    /// Maximum token budget for the response.
+    pub max_tokens: usize,
+}
+
+impl Fixture {
+    /// Load a fixture from a TOML file, rejecting test cases with an empty `id` or `query`.
+    pub fn load(path: &Path) -> Result<Self> {
+        let content = std::fs::read_to_string(path)?;
+        let fixture: Fixture = toml::from_str(&content)?;
+
+        for tc in &fixture.tests {
+            if tc.id.is_empty() {
+                bail!("Test case has empty id in {}", path.display());
+            }
+            if tc.query.is_empty() {
+                bail!(
+                    "Test case '{}' has empty query in {}",
+                    tc.id,
+                    path.display()
+                );
+            }
+        }
+
+        Ok(fixture)
+    }
+
+    /// Load every `.toml` fixture in `dir` (in sorted path order) and collect the test cases.
+    pub fn load_dir(dir: &Path) -> Result<Vec<TestCase>> {
+        let mut all_tests = Vec::new();
+
+        // Fail on an unreadable directory entry rather than dropping it: a silently
+        // skipped fixture would change the benchmark's test set without any warning.
+        let mut entries = std::fs::read_dir(dir)?.collect::<std::io::Result<Vec<_>>>()?;
+        entries.retain(|e| e.path().extension().is_some_and(|ext| ext == "toml"));
+
+        entries.sort_by_key(|e| e.path());
+
+        for entry in entries {
+            let fixture = Self::load(&entry.path())?;
+            all_tests.extend(fixture.tests);
+        }
+
+        Ok(all_tests)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::Write;
+
+    #[test]
+    fn test_fixture_parses_valid_toml() {
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("test.toml");
+        let mut f = std::fs::File::create(&path).unwrap();
+        write!(
+            f,
+            r#"
+[[test]]
+id = "t-001"
+description = "recall a decision"
+setup = ["sessions/auth.jsonl"]
+query = "what auth did we pick?"
+expected_contains = ["JWT"]
+max_tokens = 500
+
+[[test]]
+id = "t-002"
+description = "recall a bug fix"
+setup = ["sessions/bug.jsonl"]
+query = "how was the bug fixed?"
+expected_contains = ["Option"]
+max_tokens = 400
+"#
+        )
+        .unwrap();
+
+        let fixture = Fixture::load(&path).unwrap();
+        assert_eq!(fixture.tests.len(), 2);
+        assert_eq!(fixture.tests[0].id, "t-001");
+        assert_eq!(fixture.tests[1].id, "t-002");
+        assert_eq!(fixture.tests[0].max_tokens, 500);
+    }
+
+    #[test]
+    fn test_fixture_validates_empty_id() {
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("bad.toml");
+        let mut f = std::fs::File::create(&path).unwrap();
+        write!(
+            f,
+            r#"
+[[test]]
+id = ""
+description = "bad test"
+setup = []
+query = "something"
+expected_contains = []
+max_tokens = 100
+"#
+        )
+        .unwrap();
+
+        let result = Fixture::load(&path);
+        assert!(result.is_err());
+        assert!(
+            result.unwrap_err().to_string().contains("empty id"),
+            "Error should mention empty id"
+        );
+    }
+
+    #[test]
+    fn test_fixture_validates_empty_query() {
+        let dir = tempfile::tempdir().unwrap();
+        let path = dir.path().join("bad.toml");
+        let mut f = std::fs::File::create(&path).unwrap();
+        write!(
+            f,
+            r#"
+[[test]]
+id = "t-001"
+description = "bad test"
+setup = []
+query = ""
+expected_contains = []
+max_tokens = 100
+"#
+        )
+        .unwrap();
+
+        let result = Fixture::load(&path);
+        assert!(result.is_err());
+        assert!(
+            result.unwrap_err().to_string().contains("empty query"),
+            "Error should mention empty query"
+        );
+    }
+
+    #[test]
+    fn test_load_dir_collects_all_fixtures() {
+        let dir = tempfile::tempdir().unwrap();
+
+        // Create first fixture file
+        let path1 = dir.path().join("a.toml");
+        let mut f1 = std::fs::File::create(&path1).unwrap();
+        write!(
+            f1,
+            r#"
+[[test]]
+id = "a-001"
+description = "test a"
+setup = []
+query = "query a"
+expected_contains = []
+max_tokens = 100
+"#
+        )
+        .unwrap();
+
+        // Create second fixture file
+        let path2 = dir.path().join("b.toml");
+        let mut f2 = std::fs::File::create(&path2).unwrap();
+        write!(
+            f2,
+            r#"
+[[test]]
+id = "b-001"
+description = "test b1"
+setup = []
+query = "query b1"
+expected_contains = []
+max_tokens = 200
+
+[[test]]
+id = "b-002"
+description = "test b2"
+setup = []
+query = "query b2"
+expected_contains = []
+max_tokens = 300
+"#
+        )
+        .unwrap();
+
+        let tests = Fixture::load_dir(dir.path()).unwrap();
+        assert_eq!(tests.len(), 3);
+        assert_eq!(tests[0].id, "a-001");
+        assert_eq!(tests[1].id, "b-001");
+        assert_eq!(tests[2].id, "b-002");
+    }
+}
diff --git a/crates/memory-bench/src/lib.rs b/crates/memory-bench/src/lib.rs
new file mode 100644
index 0000000..81083fd
--- /dev/null
+++ b/crates/memory-bench/src/lib.rs
@@ -0,0 +1,6 @@
+pub mod baseline;
+pub mod fixture;
+pub mod locomo;
+pub mod report;
+pub mod runner;
+pub mod scorer;
diff --git a/crates/memory-bench/src/locomo.rs b/crates/memory-bench/src/locomo.rs
new file mode 100644
index 0000000..7499eaf
--- /dev/null
+++ b/crates/memory-bench/src/locomo.rs
@@ -0,0 +1,395 @@
+//! LOCOMO dataset adapter for benchmark evaluation.
+//!
+//! Loads conversations from the Snap Research LOCOMO dataset format,
+//! scores answers against gold-standard questions, and aggregates
+//! results with per-question-type breakdowns (single_hop, multi_hop,
+//! temporal, open_domain).
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::Path;
+
+use anyhow::Result;
+
+/// A single LOCOMO conversation containing turns and evaluation questions.
+#[derive(Debug, Deserialize, Clone)]
+pub struct LocomoConversation {
+    /// Unique identifier for the conversation.
+    pub conversation_id: String,
+    /// Ordered dialogue turns.
+    pub turns: Vec<Turn>,
+    /// Gold-standard questions for evaluation.
+    pub questions: Vec<Question>,
+}
+
+/// One utterance within a LOCOMO conversation.
+#[derive(Debug, Deserialize, Clone)]
+pub struct Turn {
+    /// Who is speaking (e.g., "user", "assistant").
+    pub role: String,
+    /// What was said in this turn.
+    pub content: String,
+}
+
+/// An evaluation question paired with its gold-standard answer and category.
+#[derive(Debug, Deserialize, Clone)]
+pub struct Question {
+    /// Text of the question.
+    pub question: String,
+    /// Gold-standard answer the response is scored against.
+    pub answer: String,
+    /// Category: single_hop, multi_hop, temporal, or open_domain (JSON key `type`).
+    #[serde(rename = "type")]
+    pub question_type: String,
+}
+
+/// Evaluation result for a single conversation.
+#[derive(Debug, Serialize, Clone)]
+pub struct LocomoResult {
+    /// Conversation identifier.
+    pub conversation_id: String,
+    /// Total number of questions evaluated.
+    pub total_questions: usize,
+    /// Number of correct answers.
+    pub correct: usize,
+    /// Overall score (correct / total).
+    pub score: f64,
+    /// Scores broken down by question type (keyed by the question's `type` string).
+    pub by_type: HashMap<String, TypeScore>,
+}
+
+/// Tally and ratio for one question type.
+#[derive(Debug, Serialize, Clone)]
+pub struct TypeScore {
+    /// How many questions of this type were seen.
+    pub total: usize,
+    /// How many of them were answered correctly.
+    pub correct: usize,
+    /// Ratio correct / total for this type.
+    pub score: f64,
+}
+
+/// Aggregate result across all conversations.
+#[derive(Debug, Serialize)]
+pub struct LocomoAggregateResult {
+    /// Number of conversations evaluated.
+    pub conversations: usize,
+    /// Total questions across all conversations.
+    pub total_questions: usize,
+    /// Overall score across all conversations.
+    pub overall_score: f64,
+    /// Aggregated scores by question type.
+    pub by_type: HashMap<String, TypeScore>,
+    /// Per-conversation results.
+    pub per_conversation: Vec<LocomoResult>,
+}
+
+/// Load all LOCOMO conversations from a dataset directory.
+///
+/// Deserializes every `.json` file into a `LocomoConversation`, in sorted path order.
+pub fn load_dataset(dir: &Path) -> Result<Vec<LocomoConversation>> {
+    let mut paths = Vec::new();
+    for entry in std::fs::read_dir(dir)? {
+        paths.push(entry?.path());
+    }
+    paths.retain(|p| p.extension().is_some_and(|ext| ext == "json"));
+    paths.sort(); // `read_dir` order is platform-dependent; sort for reproducible reports
+    let mut conversations: Vec<LocomoConversation> = Vec::with_capacity(paths.len());
+    for path in &paths {
+        conversations.push(serde_json::from_str(&std::fs::read_to_string(path)?)?);
+    }
+    Ok(conversations)
+}
+
+/// Score a single conversation's questions against retrieved answers.
+///
+/// `answers[i]` pairs with `conv.questions[i]`; a missing answer is scored as "". An answer
+/// is correct when it contains the gold answer, compared case-insensitively.
+pub fn score_conversation(conv: &LocomoConversation, answers: &[String]) -> LocomoResult {
+    let mut by_type: HashMap<String, (usize, usize)> = HashMap::new();
+    let mut total_correct = 0;
+
+    for (i, q) in conv.questions.iter().enumerate() {
+        let answer = answers.get(i).map(|s| s.as_str()).unwrap_or("");
+        let is_correct = answer.to_lowercase().contains(&q.answer.to_lowercase());
+
+        if is_correct {
+            total_correct += 1;
+        }
+
+        let entry = by_type.entry(q.question_type.clone()).or_insert((0, 0));
+        entry.0 += 1; // total
+        if is_correct {
+            entry.1 += 1; // correct
+        }
+    }
+
+    let total = conv.questions.len();
+    let score = if total == 0 {
+        0.0
+    } else {
+        total_correct as f64 / total as f64
+    };
+
+    let by_type = by_type
+        .into_iter()
+        .map(|(k, (t, c))| {
+            let s = if t == 0 { 0.0 } else { c as f64 / t as f64 };
+            (
+                k,
+                TypeScore {
+                    total: t,
+                    correct: c,
+                    score: s,
+                },
+            )
+        })
+        .collect();
+
+    LocomoResult {
+        conversation_id: conv.conversation_id.clone(),
+        total_questions: total,
+        correct: total_correct,
+        score,
+        by_type,
+    }
+}
+
+/// Aggregate results across all conversations.
+///
+/// Sums question and correct counts (overall and per type); scores are 0.0 when a total is 0.
+pub fn aggregate_results(results: &[LocomoResult]) -> LocomoAggregateResult {
+    let mut total_questions = 0;
+    let mut total_correct = 0;
+    let mut by_type: HashMap<String, (usize, usize)> = HashMap::new();
+
+    for r in results {
+        total_questions += r.total_questions;
+        total_correct += r.correct;
+        for (k, ts) in &r.by_type {
+            let entry = by_type.entry(k.clone()).or_insert((0, 0));
+            entry.0 += ts.total;
+            entry.1 += ts.correct;
+        }
+    }
+
+    let overall_score = if total_questions == 0 {
+        0.0
+    } else {
+        total_correct as f64 / total_questions as f64
+    };
+
+    let by_type = by_type
+        .into_iter()
+        .map(|(k, (t, c))| {
+            let s = if t == 0 { 0.0 } else { c as f64 / t as f64 };
+            (
+                k,
+                TypeScore {
+                    total: t,
+                    correct: c,
+                    score: s,
+                },
+            )
+        })
+        .collect();
+
+    LocomoAggregateResult {
+        conversations: results.len(),
+        total_questions,
+        overall_score,
+        by_type,
+        per_conversation: results.to_vec(),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_locomo_conversation_parses() {
+        let json = r#"{
+ "conversation_id": "conv-001",
+ "turns": [
+ {"role": "user", "content": "Hello"},
+ {"role": "assistant", "content": "Hi there"}
+ ],
+ "questions": [
+ {"question": "What did the user say?", "answer": "Hello", "type": "single_hop"}
+ ]
+ }"#;
+        let conv: LocomoConversation = serde_json::from_str(json).unwrap();
+        assert_eq!(conv.conversation_id, "conv-001");
+        assert_eq!(conv.turns.len(), 2);
+        assert_eq!(conv.questions.len(), 1);
+        assert_eq!(conv.questions[0].question_type, "single_hop");
+    }
+
+    #[test]
+    fn test_locomo_conversation_multiple_types() {
+        let json = r#"{
+ "conversation_id": "conv-002",
+ "turns": [{"role": "user", "content": "test"}],
+ "questions": [
+ {"question": "q1", "answer": "a1", "type": "single_hop"},
+ {"question": "q2", "answer": "a2", "type": "multi_hop"},
+ {"question": "q3", "answer": "a3", "type": "temporal"},
+ {"question": "q4", "answer": "a4", "type": "open_domain"}
+ ]
+ }"#;
+        let conv: LocomoConversation = serde_json::from_str(json).unwrap();
+        assert_eq!(conv.questions.len(), 4);
+        let types: Vec<&str> = conv
+            .questions
+            .iter()
+            .map(|q| q.question_type.as_str())
+            .collect();
+        assert!(types.contains(&"single_hop"));
+        assert!(types.contains(&"multi_hop"));
+        assert!(types.contains(&"temporal"));
+        assert!(types.contains(&"open_domain"));
+    }
+
+    #[test]
+    fn test_score_conversation_all_correct() {
+        let conv = LocomoConversation {
+            conversation_id: "test".to_string(),
+            turns: vec![],
+            questions: vec![
+                Question {
+                    question: "q1".into(),
+                    answer: "alpha".into(),
+                    question_type: "single_hop".into(),
+                },
+                Question {
+                    question: "q2".into(),
+                    answer: "beta".into(),
+                    question_type: "multi_hop".into(),
+                },
+            ],
+        };
+        let answers = vec![
+            "The answer is Alpha obviously".to_string(),
+            "It was beta all along".to_string(),
+        ];
+        let result = score_conversation(&conv, &answers);
+        assert!((result.score - 1.0).abs() < f64::EPSILON);
+        assert_eq!(result.correct, 2);
+        assert_eq!(result.total_questions, 2);
+    }
+
+    #[test]
+    fn test_score_conversation_partial() {
+        let conv = LocomoConversation {
+            conversation_id: "test".to_string(),
+            turns: vec![],
+            questions: vec![
+                Question {
+                    question: "q1".into(),
+                    answer: "alpha".into(),
+                    question_type: "single_hop".into(),
+                },
+                Question {
+                    question: "q2".into(),
+                    answer: "beta".into(),
+                    question_type: "single_hop".into(),
+                },
+                Question {
+                    question: "q3".into(),
+                    answer: "gamma".into(),
+                    question_type: "temporal".into(),
+                },
+                Question {
+                    question: "q4".into(),
+                    answer: "delta".into(),
+                    question_type: "temporal".into(),
+                },
+            ],
+        };
+        let answers = vec![
+            "alpha is here".to_string(),
+            "no match".to_string(),
+            "gamma found".to_string(),
+            "wrong answer".to_string(),
+        ];
+        let result = score_conversation(&conv, &answers);
+        assert!((result.score - 0.5).abs() < f64::EPSILON);
+        assert_eq!(result.correct, 2);
+    }
+
+    #[test]
+    fn test_aggregate_results() {
+        let r1 = LocomoResult {
+            conversation_id: "c1".into(),
+            total_questions: 4,
+            correct: 3,
+            score: 0.75,
+            by_type: HashMap::from([
+                (
+                    "single_hop".into(),
+                    TypeScore {
+                        total: 2,
+                        correct: 2,
+                        score: 1.0,
+                    },
+                ),
+                (
+                    "temporal".into(),
+                    TypeScore {
+                        total: 2,
+                        correct: 1,
+                        score: 0.5,
+                    },
+                ),
+            ]),
+        };
+        let r2 = LocomoResult {
+            conversation_id: "c2".into(),
+            total_questions: 2,
+            correct: 1,
+            score: 0.5,
+            by_type: HashMap::from([
+                (
+                    "single_hop".into(),
+                    TypeScore {
+                        total: 1,
+                        correct: 0,
+                        score: 0.0,
+                    },
+                ),
+                (
+                    "temporal".into(),
+                    TypeScore {
+                        total: 1,
+                        correct: 1,
+                        score: 1.0,
+                    },
+                ),
+            ]),
+        };
+        let agg = aggregate_results(&[r1, r2]);
+        assert_eq!(agg.conversations, 2);
+        assert_eq!(agg.total_questions, 6);
+        assert_eq!(agg.by_type["single_hop"].total, 3);
+        assert_eq!(agg.by_type["single_hop"].correct, 2);
+        assert_eq!(agg.by_type["temporal"].total, 3);
+        assert_eq!(agg.by_type["temporal"].correct, 2);
+        // overall: 4/6
+        assert!((agg.overall_score - 4.0 / 6.0).abs() < 0.001);
+    }
+
+    #[test]
+    fn test_load_dataset_from_dir() {
+        let dir = tempfile::tempdir().unwrap();
+        let conv1 = r#"{"conversation_id":"c1","turns":[{"role":"user","content":"hi"}],"questions":[{"question":"q","answer":"a","type":"single_hop"}]}"#;
+        let conv2 = r#"{"conversation_id":"c2","turns":[{"role":"user","content":"bye"}],"questions":[{"question":"q2","answer":"a2","type":"temporal"}]}"#;
+        std::fs::write(dir.path().join("conv1.json"), conv1).unwrap();
+        std::fs::write(dir.path().join("conv2.json"), conv2).unwrap();
+        // Non-json file should be ignored
+        std::fs::write(dir.path().join("readme.txt"), "ignore me").unwrap();
+
+        let convs = load_dataset(dir.path()).unwrap();
+        assert_eq!(convs.len(), 2);
+    }
+}
diff --git a/crates/memory-bench/src/main.rs b/crates/memory-bench/src/main.rs
new file mode 100644
index 0000000..860f525
--- /dev/null
+++ b/crates/memory-bench/src/main.rs
@@ -0,0 +1,194 @@
+use clap::Parser;
+
+mod cli;
+
+use memory_bench::{baseline, fixture, locomo, report, runner, scorer};
+use scorer::BenchmarkReport;
+
+ /// CLI entry point: parse arguments and dispatch to the selected benchmark command.
+ ///
+ /// Markdown/JSON results go to stdout, progress and warnings to stderr. When an
+ /// `output` path is given the JSON form of the results is also written there.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error when fixtures, the dataset, or baselines cannot be loaded, or
+ /// when a temporary session file or the output file cannot be written.
+ fn main() -> anyhow::Result<()> {
+     let cli = cli::Cli::parse();
+     let config = runner::RunConfig {
+         memory_bin: cli.memory_bin.clone(),
+     };
+
+     match cli.command {
+         cli::Commands::Temporal { fixtures, output } => {
+             let report = run_category("temporal", &fixtures, &config)?;
+             print_report(&report, output.as_deref())?;
+         }
+         cli::Commands::Multisession { fixtures, output } => {
+             let report = run_category("multi", &fixtures, &config)?;
+             print_report(&report, output.as_deref())?;
+         }
+         cli::Commands::Compression { fixtures, output } => {
+             let report = run_category("compress", &fixtures, &config)?;
+             print_report(&report, output.as_deref())?;
+         }
+         cli::Commands::All {
+             fixtures,
+             output,
+             compare,
+             baselines,
+         } => {
+             // Load baselines before the run so a bad path fails fast instead of
+             // discarding the results of a completed benchmark.
+             let baselines_data = if compare {
+                 Some(baseline::Baselines::load(std::path::Path::new(&baselines))?)
+             } else {
+                 None
+             };
+             let bench_report = run_all(&fixtures, &config)?;
+             let md = report::to_markdown(&bench_report, baselines_data.as_ref());
+             println!("{md}");
+             if let Some(path) = output {
+                 std::fs::write(&path, report::to_json(&bench_report))?;
+                 eprintln!("Results written to {path}");
+             }
+         }
+         cli::Commands::Locomo {
+             dataset,
+             output,
+             compare,
+             baselines,
+         } => {
+             // The baselines are only validated here; they are not yet part of the
+             // LOCOMO output. Validate up front so a bad path fails before the run.
+             if compare {
+                 let _baselines_data = baseline::Baselines::load(std::path::Path::new(&baselines))?;
+                 eprintln!("Loaded baselines for comparison");
+             }
+
+             let conversations = locomo::load_dataset(std::path::Path::new(&dataset))?;
+             eprintln!(
+                 "Loaded {} conversations from {}",
+                 conversations.len(),
+                 dataset
+             );
+
+             let mut results = Vec::new();
+             for conv in &conversations {
+                 // Convert turns to JSONL and ingest via runner. Each value goes through
+                 // serde_json so quotes, backslashes, newlines and control characters are
+                 // all escaped and every turn stays on exactly one line.
+                 let temp_dir = tempfile::tempdir()?;
+                 let session_path = temp_dir.path().join("session.jsonl");
+                 let mut lines = Vec::new();
+                 for turn in &conv.turns {
+                     let role = serde_json::to_string(&turn.role.to_string())?;
+                     let content = serde_json::to_string(&turn.content)?;
+                     lines.push(format!("{{\"role\":{role},\"content\":{content}}}"));
+                 }
+                 std::fs::write(&session_path, lines.join("\n"))?;
+
+                 // Ingestion stays best-effort, but a failure is reported because it
+                 // makes the scores for this conversation meaningless.
+                 let session_str = session_path.to_string_lossy();
+                 if let Err(err) = runner::ingest_session(&session_str, &config) {
+                     eprintln!(
+                         "warning: failed to ingest {}: {err}",
+                         session_path.display()
+                     );
+                 }
+
+                 // Run each question through runner and collect answers
+                 let answers: Vec<_> = conv
+                     .questions
+                     .iter()
+                     .map(|q| runner::run_query(&q.question, &config).raw_output)
+                     .collect();
+
+                 results.push(locomo::score_conversation(conv, &answers));
+             }
+
+             let aggregate = locomo::aggregate_results(&results);
+
+             let json = serde_json::to_string_pretty(&aggregate)?;
+             println!("{json}");
+             if let Some(path) = output {
+                 std::fs::write(&path, &json)?;
+                 eprintln!("Results written to {path}");
+             }
+         }
+     }
+     Ok(())
+ }
+
+ /// Run benchmarks for a single category by filtering test case IDs by prefix.
+ ///
+ /// Loads every fixture under `fixtures_dir`, keeps the test cases whose `id` starts
+ /// with `category_prefix`, and runs them with `run_tests`.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error when the fixtures cannot be loaded.
+ fn run_category(
+     category_prefix: &str,
+     fixtures_dir: &str,
+     config: &runner::RunConfig,
+ ) -> anyhow::Result<BenchmarkReport> {
+     let all_tests = fixture::Fixture::load_dir(std::path::Path::new(fixtures_dir))?;
+     let tests: Vec<_> = all_tests
+         .into_iter()
+         .filter(|t| t.id.starts_with(category_prefix))
+         .collect();
+
+     run_tests(&tests, config)
+ }
+
+ /// Run all benchmark categories and aggregate into one report.
+ ///
+ /// Every test case found under `fixtures_dir` is executed, with no category filter.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error when the fixtures cannot be loaded.
+ fn run_all(fixtures_dir: &str, config: &runner::RunConfig) -> anyhow::Result<BenchmarkReport> {
+     let tests = fixture::Fixture::load_dir(std::path::Path::new(fixtures_dir))?;
+     run_tests(&tests, config)
+ }
+
+ /// Execute a set of test cases and produce a benchmark report.
+ ///
+ /// For each test case the setup session files are ingested, the query is run, and
+ /// the hit flag, latency and estimated token count are recorded. The compression
+ /// ratio compares the tokens returned by the query with a token estimate of the
+ /// setup files' contents; tests whose setup yields no tokens are left out of the
+ /// compression average.
+ ///
+ /// NOTE(review): recall@5 is computed from the same per-test hit flags as accuracy,
+ /// with the test count as denominator, so the two metrics currently coincide.
+ fn run_tests(
+     tests: &[fixture::TestCase],
+     config: &runner::RunConfig,
+ ) -> anyhow::Result<BenchmarkReport> {
+     let mut hits = Vec::with_capacity(tests.len());
+     let mut latencies = Vec::with_capacity(tests.len());
+     let mut total_tokens = 0usize;
+     let mut compression_ratios = Vec::new();
+
+     for test in tests {
+         // Ingest setup session files. Still best-effort, but a failure is reported
+         // because it invalidates the result of this test case.
+         for setup_path in &test.setup {
+             if let Err(err) = runner::ingest_session(setup_path, config) {
+                 eprintln!("warning: failed to ingest {setup_path}: {err}");
+             }
+         }
+
+         // Run the query
+         let result = runner::run_query(&test.query, config);
+         let hit = scorer::score_result(&result.raw_output, &test.expected_contains);
+         hits.push(hit);
+         latencies.push(result.latency_ms);
+         total_tokens += result.tokens_estimated;
+
+         // Compute compression ratio against the *contents* of the setup files.
+         // `test.setup` holds file paths, so estimating from those strings directly
+         // would measure the path length rather than the ingested text.
+         let raw_sessions: Vec<String> = test
+             .setup
+             .iter()
+             .filter_map(|path| std::fs::read_to_string(path).ok())
+             .collect();
+         let raw_tokens = scorer::estimate_raw_tokens(&raw_sessions);
+         if raw_tokens > 0 {
+             let ratio = scorer::compute_compression_ratio(result.tokens_estimated, raw_tokens);
+             compression_ratios.push(ratio);
+         }
+     }
+
+     // `scorer::percentile` expects its input sorted ascending.
+     latencies.sort_unstable();
+
+     let test_count = tests.len();
+     let pass_count = hits.iter().filter(|&&h| h).count();
+     let accuracy = scorer::compute_accuracy(&hits);
+     let recall_at_5 = scorer::compute_recall_at_k(&hits, test_count);
+     // `checked_div` yields `None` for an empty test set; report 0 in that case.
+     let token_usage_avg = total_tokens.checked_div(test_count).unwrap_or(0);
+     let latency_p50_ms = scorer::percentile(&latencies, 50.0);
+     let latency_p95_ms = scorer::percentile(&latencies, 95.0);
+     let compression_ratio = if compression_ratios.is_empty() {
+         0.0
+     } else {
+         compression_ratios.iter().sum::<f64>() / compression_ratios.len() as f64
+     };
+
+     Ok(BenchmarkReport {
+         accuracy,
+         recall_at_5,
+         token_usage_avg,
+         latency_p50_ms,
+         latency_p95_ms,
+         compression_ratio,
+         test_count,
+         pass_count,
+     })
+ }
+
+ /// Emit the report as a markdown table on stdout; when `output` names a file, also
+ /// save the JSON form there and confirm the path on stderr.
+ fn print_report(bench_report: &BenchmarkReport, output: Option<&str>) -> anyhow::Result<()> {
+     println!("{}", report::to_markdown(bench_report, None));
+
+     if let Some(destination) = output {
+         std::fs::write(destination, report::to_json(bench_report))?;
+         eprintln!("Results written to {destination}");
+     }
+
+     Ok(())
+ }
diff --git a/crates/memory-bench/src/report.rs b/crates/memory-bench/src/report.rs
new file mode 100644
index 0000000..130960c
--- /dev/null
+++ b/crates/memory-bench/src/report.rs
@@ -0,0 +1,114 @@
+use crate::baseline::Baselines;
+use crate::scorer::BenchmarkReport;
+
+ /// Render a benchmark report as pretty-printed JSON.
+ ///
+ /// Yields an empty string if serialization fails.
+ pub fn to_json(report: &BenchmarkReport) -> String {
+     match serde_json::to_string_pretty(report) {
+         Ok(json) => json,
+         Err(_) => String::new(),
+     }
+ }
+
+ /// Render a benchmark report as a markdown table.
+ ///
+ /// With `baselines` present the table gains MemMachine and Mem0 columns; a
+ /// competitor figure missing from the baselines is shown as `-`. Without baselines
+ /// a two-column metric/value table is produced. A passed/total test summary line
+ /// always follows the table.
+ pub fn to_markdown(report: &BenchmarkReport, baselines: Option<&Baselines>) -> String {
+     /// Placeholder cell for a figure that is not available.
+     fn dash() -> String {
+         "-".to_string()
+     }
+
+     let table = match baselines {
+         Some(bl) => {
+             let mm = bl.memmachine.as_ref();
+             let m0 = bl.mem0.as_ref();
+
+             // Competitor cells, each falling back to the dash placeholder.
+             let mm_accuracy = mm
+                 .and_then(|m| m.locomo_score)
+                 .map(|v| format!("{:.1}%", v * 100.0))
+                 .unwrap_or_else(dash);
+             let m0_accuracy = m0
+                 .and_then(|m| m.accuracy_vs_openai_memory)
+                 .map(|v| format!("+{:.0}%", v * 100.0))
+                 .unwrap_or_else(dash);
+             let mm_latency = mm
+                 .and_then(|m| m.latency_improvement)
+                 .map(|v| format!("{:.0}% faster", v * 100.0))
+                 .unwrap_or_else(dash);
+             let m0_latency = m0
+                 .and_then(|m| m.latency_reduction)
+                 .map(|v| format!("{:.0}% reduction", v * 100.0))
+                 .unwrap_or_else(dash);
+             let mm_tokens = mm
+                 .and_then(|m| m.token_reduction)
+                 .map(|v| format!("{:.0}%", v * 100.0))
+                 .unwrap_or_else(dash);
+             let m0_tokens = m0
+                 .and_then(|m| m.token_reduction)
+                 .map(|v| format!("{:.0}%", v * 100.0))
+                 .unwrap_or_else(dash);
+
+             [
+                 "| Metric | Agent-Memory | MemMachine | Mem0 |\n".to_string(),
+                 "|--------|-------------|------------|------|\n".to_string(),
+                 format!(
+                     "| Accuracy | {:.1}% | {} | {} |\n",
+                     report.accuracy * 100.0,
+                     mm_accuracy,
+                     m0_accuracy
+                 ),
+                 format!(
+                     "| Recall@5 | {:.1}% | - | - |\n",
+                     report.recall_at_5 * 100.0
+                 ),
+                 format!("| Avg Tokens | {} | - | - |\n", report.token_usage_avg),
+                 format!(
+                     "| Latency p50 | {}ms | {} | {} |\n",
+                     report.latency_p50_ms, mm_latency, m0_latency
+                 ),
+                 format!("| Latency p95 | {}ms | - | - |\n", report.latency_p95_ms),
+                 format!(
+                     "| Compression | {:.1}% | {} | {} |\n",
+                     report.compression_ratio * 100.0,
+                     mm_tokens,
+                     m0_tokens
+                 ),
+             ]
+             .concat()
+         }
+         None => [
+             "| Metric | Value |\n".to_string(),
+             "|--------|-------|\n".to_string(),
+             format!("| Accuracy | {:.1}% |\n", report.accuracy * 100.0),
+             format!("| Recall@5 | {:.1}% |\n", report.recall_at_5 * 100.0),
+             format!("| Avg Tokens | {} |\n", report.token_usage_avg),
+             format!("| Latency p50 | {}ms |\n", report.latency_p50_ms),
+             format!("| Latency p95 | {}ms |\n", report.latency_p95_ms),
+             format!(
+                 "| Compression | {:.1}% |\n",
+                 report.compression_ratio * 100.0
+             ),
+         ]
+         .concat(),
+     };
+
+     let mut out = String::from("# Benchmark Results\n\n");
+     out.push_str(&table);
+     out.push_str(&format!(
+         "\n**Tests:** {}/{} passed\n",
+         report.pass_count, report.test_count
+     ));
+     out
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     /// Fixed report shared by the tests in this module.
+     fn fixture_report() -> BenchmarkReport {
+         BenchmarkReport {
+             accuracy: 0.85,
+             recall_at_5: 0.70,
+             token_usage_avg: 300,
+             latency_p50_ms: 45,
+             latency_p95_ms: 120,
+             compression_ratio: 0.75,
+             test_count: 10,
+             pass_count: 8,
+         }
+     }
+
+     /// JSON produced by `to_json` must deserialize back to an equal report.
+     #[test]
+     fn test_to_json_roundtrips() {
+         let original = fixture_report();
+         let decoded: BenchmarkReport = serde_json::from_str(&to_json(&original)).unwrap();
+         assert_eq!(decoded, original);
+     }
+
+     /// The baseline-free table must still label its metric rows.
+     #[test]
+     fn test_to_markdown_contains_headers() {
+         let md = to_markdown(&fixture_report(), None);
+         let expectations = [
+             ("Accuracy", "Should contain Accuracy header"),
+             ("Recall@5", "Should contain Recall@5 header"),
+         ];
+         for (needle, message) in expectations {
+             assert!(md.contains(needle), "{}", message);
+         }
+     }
+ }
diff --git a/crates/memory-bench/src/runner.rs b/crates/memory-bench/src/runner.rs
new file mode 100644
index 0000000..6b2a5ed
--- /dev/null
+++ b/crates/memory-bench/src/runner.rs
@@ -0,0 +1,86 @@
+use std::io::BufRead;
+use std::process::Command;
+use std::time::Instant;
+
+ /// Configuration for the benchmark runner.
+ ///
+ /// Derives `Debug` and `Clone` so a configuration can be logged and handed to
+ /// several runs without being rebuilt.
+ #[derive(Debug, Clone)]
+ pub struct RunConfig {
+     /// Path to the memory binary (default: "memory").
+     pub memory_bin: String,
+ }
+
+ impl Default for RunConfig {
+     /// Default configuration: use the binary named `memory`.
+     fn default() -> Self {
+         let memory_bin = String::from("memory");
+         Self { memory_bin }
+     }
+ }
+
+ /// Result of running a single query against the memory binary.
+ ///
+ /// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so results can be logged and
+ /// compared in tests.
+ #[derive(Debug, Clone, PartialEq, Eq)]
+ pub struct QueryResult {
+     /// Raw stdout output from the binary.
+     pub raw_output: String,
+     /// Elapsed time in milliseconds.
+     pub latency_ms: u64,
+     /// Token count from meta.tokens_estimated in JSON envelope.
+     pub tokens_estimated: usize,
+     /// Whether the command exited successfully.
+     pub success: bool,
+ }
+
+ /// Invoke `<memory_bin> search <query> --format=json`, timing the call and capturing stdout.
+ ///
+ /// The latency is taken as soon as the process has finished, before its output is
+ /// parsed. If the binary cannot be launched the result has empty output, zero
+ /// tokens and `success == false`.
+ pub fn run_query(query: &str, config: &RunConfig) -> QueryResult {
+     let timer = Instant::now();
+     let spawned = Command::new(&config.memory_bin)
+         .arg("search")
+         .arg(query)
+         .arg("--format=json")
+         .output();
+     let latency_ms = timer.elapsed().as_millis() as u64;
+
+     if let Ok(out) = spawned {
+         let raw_output = String::from_utf8_lossy(&out.stdout).to_string();
+         let tokens_estimated = extract_tokens_estimated(&raw_output);
+         return QueryResult {
+             raw_output,
+             latency_ms,
+             tokens_estimated,
+             success: out.status.success(),
+         };
+     }
+
+     // The process could not be started at all.
+     QueryResult {
+         raw_output: String::new(),
+         latency_ms,
+         tokens_estimated: 0,
+         success: false,
+     }
+ }
+
+ /// Extract meta.tokens_estimated from JSON envelope output.
+ ///
+ /// Returns 0 when the output is not valid JSON, when the `meta.tokens_estimated`
+ /// field is absent, or when it is not an unsigned integer. A value too large for
+ /// `usize` saturates at `usize::MAX` instead of being truncated.
+ fn extract_tokens_estimated(json_output: &str) -> usize {
+     serde_json::from_str::<serde_json::Value>(json_output)
+         .ok()
+         .and_then(|v| v.get("meta")?.get("tokens_estimated")?.as_u64())
+         .map_or(0, |tokens| usize::try_from(tokens).unwrap_or(usize::MAX))
+ }
+
+ /// Ingest a JSONL session file by calling `memory add` for each line.
+ ///
+ /// Blank lines are skipped. Every other line is trimmed and passed verbatim as the
+ /// `--content` of an `add --kind episodic` invocation of the configured binary.
+ ///
+ /// # Errors
+ ///
+ /// Returns an error when the session file cannot be opened or read, or when the
+ /// memory binary cannot be launched. An `add` command that runs but exits with a
+ /// non-zero status is reported on stderr and ingestion continues with the next line.
+ pub fn ingest_session(session_path: &str, config: &RunConfig) -> anyhow::Result<()> {
+     let file = std::fs::File::open(session_path)?;
+     let reader = std::io::BufReader::new(file);
+
+     for (index, line) in reader.lines().enumerate() {
+         let line = line?;
+         let trimmed = line.trim();
+         if trimmed.is_empty() {
+             continue;
+         }
+
+         // A launch failure (e.g. binary not found) would repeat for every remaining
+         // line, so surface it immediately instead of silently ingesting nothing.
+         let output = Command::new(&config.memory_bin)
+             .args(["add", "--content", trimmed, "--kind", "episodic"])
+             .output()
+             .map_err(|err| anyhow::anyhow!("failed to run `{}`: {err}", config.memory_bin))?;
+
+         if !output.status.success() {
+             eprintln!(
+                 "warning: `{} add` failed for {session_path} line {} ({})",
+                 config.memory_bin,
+                 index + 1,
+                 output.status
+             );
+         }
+     }
+
+     Ok(())
+ }
diff --git a/crates/memory-bench/src/scorer.rs b/crates/memory-bench/src/scorer.rs
new file mode 100644
index 0000000..e615742
--- /dev/null
+++ b/crates/memory-bench/src/scorer.rs
@@ -0,0 +1,142 @@
+ /// Case-insensitive hit test: true when `result` contains any of the expected strings.
+ ///
+ /// An empty `expected_contains` list never produces a hit.
+ pub fn score_result(result: &str, expected_contains: &[String]) -> bool {
+     let haystack = result.to_lowercase();
+     for needle in expected_contains {
+         if haystack.contains(needle.to_lowercase().as_str()) {
+             return true;
+         }
+     }
+     false
+ }
+
+ /// Accuracy: the share of `hits` entries that are `true`, or 0.0 for an empty slice.
+ pub fn compute_accuracy(hits: &[bool]) -> f64 {
+     match hits.len() {
+         0 => 0.0,
+         total => {
+             let passed = hits.iter().copied().filter(|hit| *hit).count();
+             passed as f64 / total as f64
+         }
+     }
+ }
+
+ /// Recall@k: number of `true` entries in `hits_in_top_k` divided by `total_relevant`.
+ ///
+ /// Returns 0.0 when `total_relevant` is 0.
+ pub fn compute_recall_at_k(hits_in_top_k: &[bool], total_relevant: usize) -> f64 {
+     if total_relevant == 0 {
+         return 0.0;
+     }
+
+     let mut found = 0usize;
+     for &hit in hits_in_top_k {
+         if hit {
+             found += 1;
+         }
+     }
+     found as f64 / total_relevant as f64
+ }
+
+ /// Pick the element at percentile `p` (0–100) of an ascending-sorted slice.
+ ///
+ /// The rank is `p / 100 * (len - 1)` rounded to the nearest index and clamped to
+ /// the last element. An empty slice yields 0.
+ pub fn percentile(sorted_values: &[u64], p: f64) -> u64 {
+     match sorted_values.len() {
+         0 => 0,
+         len => {
+             let fraction = p / 100.0;
+             let rank = (fraction * (len as f64 - 1.0)).round() as usize;
+             let last = len - 1;
+             sorted_values[if rank > last { last } else { rank }]
+         }
+     }
+ }
+
+ /// Compression ratio of the returned context relative to the raw input.
+ ///
+ /// Computed as `1.0 - (context_tokens as f64 / raw_tokens as f64)`.
+ ///
+ /// - `context_tokens`: tokens_estimated returned by the memory search JSON envelope.
+ /// - `raw_tokens`: token estimate for the raw setup input (characters divided by 4
+ ///   as a chars-per-token approximation).
+ ///
+ /// A `raw_tokens` of 0 yields 0.0 so the division is never performed with a zero
+ /// denominator. The result is negative when `context_tokens` exceeds `raw_tokens`.
+ pub fn compute_compression_ratio(context_tokens: usize, raw_tokens: usize) -> f64 {
+     match raw_tokens {
+         0 => 0.0,
+         raw => {
+             let retained = context_tokens as f64 / raw as f64;
+             1.0 - retained
+         }
+     }
+ }
+
+ /// Estimate raw token count from JSONL setup strings (TestCase.setup lines).
+ ///
+ /// Sums the number of characters of all setup strings and divides by 4
+ /// (chars-per-token approximation), rounding up. Characters are counted as Unicode
+ /// scalar values rather than UTF-8 bytes, so non-ASCII text is not over-counted.
+ /// An empty slice yields 0.
+ pub fn estimate_raw_tokens(setup_lines: &[String]) -> usize {
+     let total_chars: usize = setup_lines.iter().map(|s| s.chars().count()).sum();
+     // Integer ceiling division; avoids a lossy round-trip through f64.
+     total_chars / 4 + usize::from(total_chars % 4 != 0)
+ }
+
+ /// Aggregated benchmark report with all computed metrics.
+ ///
+ /// Serializable to and from JSON via serde; `PartialEq` allows round-trip
+ /// comparisons in tests. Ratio fields are stored as fractions and multiplied by
+ /// 100 when rendered as percentages.
+ #[derive(Debug, Clone, serde::Serialize, serde::Deserialize, PartialEq)]
+ pub struct BenchmarkReport {
+ /// Fraction of test cases that scored a hit.
+ pub accuracy: f64,
+ /// Recall@5, stored as a fraction.
+ pub recall_at_5: f64,
+ /// Average estimated token count per test case.
+ pub token_usage_avg: usize,
+ /// Median (50th percentile) query latency in milliseconds.
+ pub latency_p50_ms: u64,
+ /// 95th percentile query latency in milliseconds.
+ pub latency_p95_ms: u64,
+ /// Compression ratio, stored as a fraction.
+ pub compression_ratio: f64,
+ /// Number of test cases executed.
+ pub test_count: usize,
+ /// Number of test cases that scored a hit.
+ pub pass_count: usize,
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     /// Convert string literals into the owned `String`s the scorer API takes.
+     fn owned(items: &[&str]) -> Vec<String> {
+         items.iter().map(|s| s.to_string()).collect()
+     }
+
+     /// True when `actual` lies strictly within `tolerance` of `expected`.
+     fn close(actual: f64, expected: f64, tolerance: f64) -> bool {
+         (actual - expected).abs() < tolerance
+     }
+
+     #[test]
+     fn test_score_hit_when_expected_present() {
+         let expected = owned(&["JWT"]);
+         assert!(score_result("We chose JWT for stateless auth", &expected));
+     }
+
+     #[test]
+     fn test_score_miss_when_none_present() {
+         let expected = owned(&["JWT"]);
+         assert!(!score_result("We chose sessions with cookies", &expected));
+     }
+
+     #[test]
+     fn test_score_case_insensitive() {
+         let expected = owned(&["jwt"]);
+         assert!(score_result("JWT tokens are great", &expected));
+     }
+
+     #[test]
+     fn test_accuracy_all_hits() {
+         let accuracy = compute_accuracy(&[true, true, true]);
+         assert!(close(accuracy, 1.0, f64::EPSILON));
+     }
+
+     #[test]
+     fn test_accuracy_partial() {
+         let accuracy = compute_accuracy(&[true, false, true]);
+         assert!(close(accuracy, 2.0 / 3.0, 0.001));
+     }
+
+     #[test]
+     fn test_accuracy_empty() {
+         let accuracy = compute_accuracy(&[]);
+         assert!(close(accuracy, 0.0, f64::EPSILON));
+     }
+
+     #[test]
+     fn test_percentile_p50() {
+         let sorted = [10u64, 20, 30, 40, 50];
+         assert_eq!(percentile(&sorted, 50.0), 30);
+     }
+
+     #[test]
+     fn test_percentile_p95() {
+         let sorted = [10u64, 20, 30, 40, 50];
+         assert_eq!(percentile(&sorted, 95.0), 50);
+     }
+
+     #[test]
+     fn test_compression_ratio_typical() {
+         let ratio = compute_compression_ratio(250, 1000);
+         assert!(close(ratio, 0.75, f64::EPSILON));
+     }
+
+     #[test]
+     fn test_compression_ratio_zero_raw() {
+         let ratio = compute_compression_ratio(100, 0);
+         assert!(close(ratio, 0.0, f64::EPSILON));
+     }
+
+     #[test]
+     fn test_estimate_raw_tokens() {
+         // ceil(11/4) = 3
+         let lines = owned(&["hello world"]);
+         assert_eq!(estimate_raw_tokens(&lines), 3);
+     }
+ }