From 7d39527259401ef542a53feb8d30afd3ed5eddcc Mon Sep 17 00:00:00 2001 From: lanceretter Date: Fri, 8 May 2026 08:38:39 -0400 Subject: [PATCH 01/41] fix: bootstrap forward-references for v39-v41 schema replay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three column-with-index forward references in the embedded schema blob were missing from applyForwardReferenceBootstrap, so any brain at config.version < 39 (Postgres) or < 41 (PGLite) wedges before the migration runner can advance. Reproduced end-to-end on a PlanetScale Postgres brain stuck at config.version=34 trying to upgrade to v0.30.0: ERROR: column "effective_date" does not exist ERROR: column cc.modality does not exist (After upgrading, gbrain search and gbrain reindex-frontmatter both fail.) The schema-blob references that crash before migrations run: - v39 (multimodal_dual_column_v0_27_1): CREATE INDEX idx_chunks_embedding_image ON content_chunks USING hnsw (embedding_image vector_cosine_ops) WHERE embedding_image IS NOT NULL; - v41 (pages_recency_columns): CREATE INDEX pages_coalesce_date_idx ON pages ((COALESCE(effective_date, updated_at))); PGLite already covered v39 (lines 273+, 308+, 382-392). Postgres and PGLite both lacked v40+v41 coverage. This commit adds: - Postgres engine probe + branch for v39 (modality, embedding_image) — was entirely missing on Postgres, so Postgres brains < v39 hit the wedge that PGLite already protected against. - Both engines: probe + branch for v40+v41. Bootstraps all five additive pages columns (emotional_weight, effective_date, effective_date_source, import_filename, salience_touched_at) gated on `effective_date_exists` as the proxy. - test/schema-bootstrap-coverage.test.ts: extends REQUIRED_BOOTSTRAP_COVERAGE with the six new columns AND the pre-test DROP block so both the per-target assertion test and the end-to-end "bootstrap + SCHEMA_SQL replay" test exercise the new coverage. 
All 5 tests in schema-bootstrap-coverage pass. typecheck clean. Bootstrap stays additive-columns-only. Indexes are left to schema replay / migrations as before. --- src/core/pglite-engine.ts | 27 +++++++++++++- src/core/postgres-engine.ts | 50 ++++++++++++++++++++++++-- test/schema-bootstrap-coverage.test.ts | 31 ++++++++++++++++ 3 files changed, 105 insertions(+), 3 deletions(-) diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index 7b3d4c066..1439266be 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -271,6 +271,8 @@ export class PGLiteEngine implements BrainEngine { WHERE table_schema='public' AND table_name='content_chunks' AND column_name='search_vector') AS search_vector_exists, EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema='public' AND table_name='content_chunks' AND column_name='embedding_image') AS embedding_image_exists, + EXISTS (SELECT 1 FROM information_schema.columns + WHERE table_schema='public' AND table_name='pages' AND column_name='effective_date') AS effective_date_exists, EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema='public' AND table_name='mcp_request_log') AS mcp_log_exists, EXISTS (SELECT 1 FROM information_schema.columns @@ -292,6 +294,7 @@ export class PGLiteEngine implements BrainEngine { language_exists: boolean; search_vector_exists: boolean; embedding_image_exists: boolean; + effective_date_exists: boolean; mcp_log_exists: boolean; agent_name_exists: boolean; subagent_messages_exists: boolean; @@ -311,11 +314,16 @@ export class PGLiteEngine implements BrainEngine { // v0.27 (v36): idx_subagent_messages_provider in PGLITE_SCHEMA_SQL needs // provider_id (the SECOND column in the composite index `(job_id, provider_id)`). const needsSubagentProviderId = probe.subagent_messages_exists && !probe.subagent_provider_id_exists; + // v0.29.1 (v40 + v41): pages_coalesce_date_idx expression index in + // PGLITE_SCHEMA_SQL references effective_date. 
Use effective_date_exists + // as the proxy for the five v40 + v41 pages columns. + const needsPagesRecency = probe.pages_exists && !probe.effective_date_exists; // Fresh installs (no tables yet) and modern brains both no-op. if (!needsPagesBootstrap && !needsLinksBootstrap && !needsChunksBootstrap && !needsPagesDeletedAt && !needsChunksEmbeddingImage - && !needsMcpLogBootstrap && !needsSubagentProviderId) return; + && !needsMcpLogBootstrap && !needsSubagentProviderId + && !needsPagesRecency) return; console.log(' Pre-v0.21 brain detected, applying forward-reference bootstrap'); @@ -415,6 +423,23 @@ export class PGLiteEngine implements BrainEngine { ALTER TABLE subagent_messages ADD COLUMN IF NOT EXISTS provider_id TEXT; `); } + + if (needsPagesRecency) { + // v40 (pages_emotional_weight) adds emotional_weight; v41 + // (pages_recency_columns) adds effective_date + effective_date_source + + // import_filename + salience_touched_at and the + // `pages_coalesce_date_idx ON pages ((COALESCE(effective_date, updated_at)))` + // expression index. PGLITE_SCHEMA_SQL's CREATE INDEX for that expression + // crashes before v41 runs. Bootstrap adds all five additive columns; + // v40 + v41 run later via runMigrations and are idempotent. 
+ await this.db.exec(` + ALTER TABLE pages ADD COLUMN IF NOT EXISTS emotional_weight REAL NOT NULL DEFAULT 0.0; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS effective_date TIMESTAMPTZ; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS effective_date_source TEXT; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS import_filename TEXT; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS salience_touched_at TIMESTAMPTZ; + `); + } } async withReservedConnection(fn: (conn: ReservedConnection) => Promise): Promise { diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index ad3f0ad6b..225b7ed9d 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -301,6 +301,7 @@ export class PostgresEngine implements BrainEngine { pages_exists: boolean; source_id_exists: boolean; deleted_at_exists: boolean; + effective_date_exists: boolean; links_exists: boolean; link_source_exists: boolean; origin_page_id_exists: boolean; @@ -308,6 +309,7 @@ export class PostgresEngine implements BrainEngine { symbol_name_exists: boolean; language_exists: boolean; search_vector_exists: boolean; + embedding_image_exists: boolean; mcp_log_exists: boolean; agent_name_exists: boolean; subagent_messages_exists: boolean; @@ -320,6 +322,8 @@ export class PostgresEngine implements BrainEngine { WHERE table_schema = current_schema() AND table_name = 'pages' AND column_name = 'source_id') AS source_id_exists, EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = current_schema() AND table_name = 'pages' AND column_name = 'deleted_at') AS deleted_at_exists, + EXISTS (SELECT 1 FROM information_schema.columns + WHERE table_schema = current_schema() AND table_name = 'pages' AND column_name = 'effective_date') AS effective_date_exists, EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = current_schema() AND table_name = 'links') AS links_exists, EXISTS (SELECT 1 FROM information_schema.columns @@ -334,6 +338,8 @@ export class PostgresEngine implements 
BrainEngine { WHERE table_schema = current_schema() AND table_name = 'content_chunks' AND column_name = 'language') AS language_exists, EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = current_schema() AND table_name = 'content_chunks' AND column_name = 'search_vector') AS search_vector_exists, + EXISTS (SELECT 1 FROM information_schema.columns + WHERE table_schema = current_schema() AND table_name = 'content_chunks' AND column_name = 'embedding_image') AS embedding_image_exists, EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = current_schema() AND table_name = 'mcp_request_log') AS mcp_log_exists, EXISTS (SELECT 1 FROM information_schema.columns @@ -358,8 +364,19 @@ export class PostgresEngine implements BrainEngine { // v0.27 (v36): idx_subagent_messages_provider in SCHEMA_SQL needs provider_id // (the SECOND column in the composite index `(job_id, provider_id)`). const needsSubagentProviderId = probe.subagent_messages_exists && !probe.subagent_provider_id_exists; - - if (!needsPagesBootstrap && !needsLinksBootstrap && !needsChunksBootstrap && !needsPagesDeletedAt && !needsMcpLogBootstrap && !needsSubagentProviderId) return; + // v0.27.1 (v39): idx_chunks_embedding_image partial HNSW in SCHEMA_SQL + // references embedding_image. Use embedding_image_exists as the proxy for + // both v39 columns; modality is added in the same migration. + const needsChunksEmbeddingImage = probe.chunks_exists && !probe.embedding_image_exists; + // v0.29.1 (v40 + v41): pages_coalesce_date_idx expression index in SCHEMA_SQL + // references effective_date. Use effective_date_exists as the proxy for the + // five v40 + v41 pages columns (emotional_weight, effective_date, + // effective_date_source, import_filename, salience_touched_at). 
+ const needsPagesRecency = probe.pages_exists && !probe.effective_date_exists; + + if (!needsPagesBootstrap && !needsLinksBootstrap && !needsChunksBootstrap + && !needsPagesDeletedAt && !needsMcpLogBootstrap && !needsSubagentProviderId + && !needsChunksEmbeddingImage && !needsPagesRecency) return; console.log(' Pre-v0.21 brain detected, applying forward-reference bootstrap'); @@ -447,6 +464,35 @@ export class PostgresEngine implements BrainEngine { ALTER TABLE subagent_messages ADD COLUMN IF NOT EXISTS provider_id TEXT; `); } + + if (needsChunksEmbeddingImage) { + // v39 (multimodal_dual_column_v0_27_1) adds modality + embedding_image + // columns to content_chunks plus a partial HNSW index that references + // embedding_image. Bootstrap mirrors enough state for SCHEMA_SQL's + // `CREATE INDEX idx_chunks_embedding_image ... WHERE embedding_image IS NOT NULL` + // not to crash. v39 runs later via runMigrations and is idempotent. + await conn.unsafe(` + ALTER TABLE content_chunks ADD COLUMN IF NOT EXISTS modality TEXT NOT NULL DEFAULT 'text'; + ALTER TABLE content_chunks ADD COLUMN IF NOT EXISTS embedding_image vector(1024); + `); + } + + if (needsPagesRecency) { + // v40 (pages_emotional_weight) adds emotional_weight; v41 + // (pages_recency_columns) adds effective_date + effective_date_source + + // import_filename + salience_touched_at and the + // `pages_coalesce_date_idx ON pages ((COALESCE(effective_date, updated_at)))` + // expression index. SCHEMA_SQL's CREATE INDEX for that expression crashes + // before v41 runs. Bootstrap adds all five additive columns; v40 + v41 + // run later via runMigrations and are idempotent. 
+ await conn.unsafe(` + ALTER TABLE pages ADD COLUMN IF NOT EXISTS emotional_weight REAL NOT NULL DEFAULT 0.0; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS effective_date TIMESTAMPTZ; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS effective_date_source TEXT; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS import_filename TEXT; + ALTER TABLE pages ADD COLUMN IF NOT EXISTS salience_touched_at TIMESTAMPTZ; + `); + } } async transaction(fn: (engine: BrainEngine) => Promise): Promise { diff --git a/test/schema-bootstrap-coverage.test.ts b/test/schema-bootstrap-coverage.test.ts index fb4efebf7..0a116af28 100644 --- a/test/schema-bootstrap-coverage.test.ts +++ b/test/schema-bootstrap-coverage.test.ts @@ -77,6 +77,10 @@ const REQUIRED_BOOTSTRAP_COVERAGE: ForwardReference[] = [ // ON content_chunks USING hnsw (embedding_image vector_cosine_ops) // WHERE embedding_image IS NOT NULL`. { kind: 'column', table: 'content_chunks', column: 'embedding_image' }, + // v0.27.1 — added in the same migration as embedding_image. Sibling column; + // not directly forward-referenced by an index but the bootstrap adds it + // alongside embedding_image for the v39 contract. + { kind: 'column', table: 'content_chunks', column: 'modality' }, // v0.26.3 (v33) — forward-referenced by `CREATE INDEX idx_mcp_log_agent_time // ON mcp_request_log(agent_name, created_at DESC)`. { kind: 'column', table: 'mcp_request_log', column: 'agent_name' }, @@ -86,6 +90,19 @@ const REQUIRED_BOOTSTRAP_COVERAGE: ForwardReference[] = [ // by default, which is why this fix wave's Step 3 replaces this with a // SQL parser that extracts every column referenced by any DDL. { kind: 'column', table: 'subagent_messages', column: 'provider_id' }, + // v0.29 (v40) — pages.emotional_weight populated by recompute_emotional_weight; + // bootstrapped alongside the v41 columns since they share the v0.29.1 wave. 
+ { kind: 'column', table: 'pages', column: 'emotional_weight' }, + // v0.29.1 (v41) — forward-referenced by `CREATE INDEX pages_coalesce_date_idx + // ON pages ((COALESCE(effective_date, updated_at)))`. The expression-index + // claim from earlier plan iterations was wrong; PG's planner won't use a + // partial index for the negative side of a COALESCE — it uses the expression index. + { kind: 'column', table: 'pages', column: 'effective_date' }, + // v0.29.1 (v41) — sibling columns added in the same migration as + // effective_date; bootstrap adds them all together. + { kind: 'column', table: 'pages', column: 'effective_date_source' }, + { kind: 'column', table: 'pages', column: 'import_filename' }, + { kind: 'column', table: 'pages', column: 'salience_touched_at' }, ]; test('applyForwardReferenceBootstrap covers every forward reference declared in REQUIRED_BOOTSTRAP_COVERAGE', async () => { @@ -139,6 +156,13 @@ test('applyForwardReferenceBootstrap covers every forward reference declared in DROP INDEX IF EXISTS idx_subagent_messages_provider; ALTER TABLE subagent_messages DROP COLUMN IF EXISTS provider_id; + + DROP INDEX IF EXISTS pages_coalesce_date_idx; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date_source; + ALTER TABLE pages DROP COLUMN IF EXISTS import_filename; + ALTER TABLE pages DROP COLUMN IF EXISTS salience_touched_at; + ALTER TABLE pages DROP COLUMN IF EXISTS emotional_weight; `); // Run bootstrap in isolation (NOT initSchema). This is what we're testing. 
@@ -198,6 +222,13 @@ test('after bootstrap, PGLITE_SCHEMA_SQL replays without crashing on missing for DROP INDEX IF EXISTS idx_chunks_embedding_image; ALTER TABLE content_chunks DROP COLUMN IF EXISTS embedding_image; ALTER TABLE content_chunks DROP COLUMN IF EXISTS modality; + + DROP INDEX IF EXISTS pages_coalesce_date_idx; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date_source; + ALTER TABLE pages DROP COLUMN IF EXISTS import_filename; + ALTER TABLE pages DROP COLUMN IF EXISTS salience_touched_at; + ALTER TABLE pages DROP COLUMN IF EXISTS emotional_weight; `); // Bootstrap, then schema replay. Either step crashing fails the test. From c5bb5142aaa0c0f99de3c9ccaa9d652f600bf88f Mon Sep 17 00:00:00 2001 From: lanceretter Date: Fri, 8 May 2026 08:22:40 -0400 Subject: [PATCH 02/41] fix(deps): declare @jsquash/png and heic-decode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both packages are direct imports in src/core/import-file.ts (decodeIfNeeded for HEIC/AVIF → PNG) but only @jsquash/avif was declared. bun --compile fails on a fresh install: error: Could not resolve: "@jsquash/png/encode.js" error: Could not resolve: "heic-decode" Adds the missing declarations so npm install / bun install bring them in. 
Versions chosen as latest at time of fix: @jsquash/png ^3.1.1 heic-decode ^2.1.0 --- bun.lock | 8 ++++++++ package.json | 2 ++ 2 files changed, 10 insertions(+) diff --git a/bun.lock b/bun.lock index 39d7ed61c..86f62516e 100644 --- a/bun.lock +++ b/bun.lock @@ -14,6 +14,7 @@ "@dqbd/tiktoken": "^1.0.22", "@electric-sql/pglite": "0.4.3", "@jsquash/avif": "^2.1.1", + "@jsquash/png": "^3.1.1", "@modelcontextprotocol/sdk": "1.29.0", "ai": "^6.0.168", "cookie-parser": "^1.4.7", @@ -23,6 +24,7 @@ "express": "^5.1.0", "express-rate-limit": "^7.5.0", "gray-matter": "^4.0.3", + "heic-decode": "^2.1.0", "marked": "^18.0.0", "openai": "^4.0.0", "pgvector": "^0.2.0", @@ -149,6 +151,8 @@ "@jsquash/avif": ["@jsquash/avif@2.1.1", "", { "dependencies": { "wasm-feature-detect": "^1.2.11" } }, "sha512-LMRxd0fMgfCLtobDh0/sFYJMMiRJTNYSEEWvRDKXlAeZ08t3gI5V+1thIT0XjXJ+SVG7Zug9B0XPyx0Ti5VRNA=="], + "@jsquash/png": ["@jsquash/png@3.1.1", "", {}, "sha512-C10pc+0H6j0h8fENOfnGOvkXCmvpSQTDGlfGd0sHphZhPSGTyLjIrHba0FaZZdsKqA/wlmhYicUHb92vfZphaw=="], + "@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="], "@opentelemetry/api": ["@opentelemetry/api@1.9.0", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="], @@ -405,6 +409,8 @@ "hasown": ["hasown@2.0.2", "", { "dependencies": { 
"function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], + "heic-decode": ["heic-decode@2.1.0", "", { "dependencies": { "libheif-js": "^1.19.8" } }, "sha512-0fB3O3WMk38+PScbHLVp66jcNhsZ/ErtQ6u2lMYu/YxXgbBtl+oKOhGQHa4RpvE68k8IzbWkABzHnyAIjR758A=="], + "hono": ["hono@4.12.10", "", {}, "sha512-mx/p18PLy5og9ufies2GOSUqep98Td9q4i/EF6X7yJgAiIopxqdfIO3jbqsi3jRgTgw88jMDEzVKi+V2EF+27w=="], "http-errors": ["http-errors@2.0.1", "", { "dependencies": { "depd": "~2.0.0", "inherits": "~2.0.4", "setprototypeof": "~1.2.0", "statuses": "~2.0.2", "toidentifier": "~1.0.1" } }, "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ=="], @@ -437,6 +443,8 @@ "kind-of": ["kind-of@6.0.3", "", {}, "sha512-dcS1ul+9tmeD95T+x28/ehLgd9mENa3LsvDTtzm3vyBEO7RPptvAD+t44WVXaUjTBRcrpFeFlC8WCruUR456hw=="], + "libheif-js": ["libheif-js@1.19.8", "", {}, "sha512-vQJWusIxO7wavpON1dusciL8Go9jsIQ+EUrckauFYAiSTjcmLAsuJh3SszLpvkwPci3JcL41ek2n+LUZGFpPIQ=="], + "marked": ["marked@18.0.0", "", { "bin": { "marked": "bin/marked.js" } }, "sha512-2e7Qiv/HJSXj8rDEpgTvGKsP8yYtI9xXHKDnrftrmnrJPaFNM7VRb2YCzWaX4BP1iCJ/XPduzDJZMFoqTCcIMA=="], "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], diff --git a/package.json b/package.json index e6b27e370..347104bb2 100644 --- a/package.json +++ b/package.json @@ -75,6 +75,7 @@ "@dqbd/tiktoken": "^1.0.22", "@electric-sql/pglite": "0.4.3", "@jsquash/avif": "^2.1.1", + "@jsquash/png": "^3.1.1", "@modelcontextprotocol/sdk": "1.29.0", "ai": "^6.0.168", "cookie-parser": "^1.4.7", @@ -84,6 +85,7 @@ "express": "^5.1.0", "express-rate-limit": "^7.5.0", "gray-matter": "^4.0.3", + "heic-decode": "^2.1.0", "marked": "^18.0.0", "openai": "^4.0.0", "pgvector": "^0.2.0", From 8677b8d32b0d382077d9235fa3ebc726b7af58e6 Mon Sep 17 00:00:00 2001 From: lanceretter Date: Fri, 8 May 2026 
08:22:55 -0400 Subject: [PATCH 03/41] fix(backfill-effective-date): replace bare BEGIN/COMMIT with engine.transaction() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit postgres.js refuses bare BEGIN/COMMIT on pooled connections with UNSAFE_TRANSACTION. The migration runner and other call sites already use engine.transaction() (which routes through sql.begin() with a reserved backend) — backfill-effective-date.ts was the holdout. Reproduces on PlanetScale Postgres (us-east-4.pg.psdb.cloud) running the v0.29.1 orchestrator's Phase B against a brain that has any rows needing backfill: Reindex ok ... UNSAFE_TRANSACTION: Only use sql.begin, sql.reserved or max: 1 Switches the per-batch transaction to engine.transaction(async tx => …). The SET LOCAL statement_timeout still scopes to the transaction; UPDATE runs through the tx-scoped engine. ROLLBACK on error happens automatically via sql.begin's contract. Equivalent fix shape to existing usages in src/core/postgres-engine.ts (lines 703, 806, 925) and the migration runner in src/core/migrate.ts (line 2147). --- src/core/backfill-effective-date.ts | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/src/core/backfill-effective-date.ts b/src/core/backfill-effective-date.ts index 807d45142..4a06a77d8 100644 --- a/src/core/backfill-effective-date.ts +++ b/src/core/backfill-effective-date.ts @@ -175,14 +175,14 @@ export async function backfillEffectiveDate( if (!opts.dryRun) { // Compute effective_date for each row, then UPDATE in a batch wrapped // in its own transaction (so SET LOCAL statement_timeout scopes to it). - // postgres.js's `transaction` would be cleaner but we're using executeRaw - // for engine portability; explicit BEGIN/COMMIT does the same on both. 
- if (isPostgres) { - await engine.executeRaw(`BEGIN`); - await engine.executeRaw(`SET LOCAL statement_timeout = '600s'`); - } + // postgres.js refuses bare BEGIN/COMMIT on pooled connections + // (UNSAFE_TRANSACTION); engine.transaction() routes through sql.begin() + // which uses a reserved backend. + await engine.transaction(async (tx) => { + if (isPostgres) { + await tx.executeRaw(`SET LOCAL statement_timeout = '600s'`); + } - try { for (const r of rows) { const fm = parseFrontmatter(r.frontmatter); const filename = r.import_filename @@ -204,21 +204,14 @@ export async function backfillEffectiveDate( if (!opts.force && datesMatch && sourcesMatch) continue; - await engine.executeRaw( + await tx.executeRaw( `UPDATE pages SET effective_date = $1::timestamptz, effective_date_source = $2 WHERE id = $3`, [computed.date ? computed.date.toISOString() : null, computed.source, r.id], ); touched++; if (computed.source === 'fallback') fallback++; } - - if (isPostgres) await engine.executeRaw(`COMMIT`); - } catch (e) { - if (isPostgres) { - try { await engine.executeRaw(`ROLLBACK`); } catch { /* ignore */ } - } - throw e; - } + }); } else { // Dry run: still count what WOULD change. for (const r of rows) { From 93f4d94808640c5894d6ab348def320accb0df12 Mon Sep 17 00:00:00 2001 From: lanceretter Date: Fri, 8 May 2026 08:23:06 -0400 Subject: [PATCH 04/41] fix(v0_29_1): connect engine before use in Phase B and Phase C MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit phaseBBackfill() and phaseCVerify() build their own engine via createEngine(toEngineConfig(cfg)) but never call engine.connect(). This worked accidentally before because executeRaw lazily falls back to db.getConnection(), but engine.transaction() (added in the companion backfill fix) requires a connected backend and surfaces the missing-connect with: No database connection: connect() has not been called. 
Fix: Run gbrain init --supabase or gbrain init --url Other orchestrators in the same directory get this right — v0_28_0.ts:181 already does `await engine.connect(engineConfig)` right after createEngine. Aligning v0_29_1 with that pattern. After this + the backfill fix, v0.29.1 orchestrator runs to 'complete' on a fresh upgrade with backfill-needed rows, instead of wedging at 'partial' status. Note: anyone hitting the wedged state after the prior failures will need `gbrain apply-migrations --force-retry 0.29.1` once before the next apply-migrations --yes succeeds (the 3-consecutive-partials guard in apply-migrations.ts is still active). --- src/commands/migrations/v0_29_1.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/commands/migrations/v0_29_1.ts b/src/commands/migrations/v0_29_1.ts index 38b677d6c..f71165d3c 100644 --- a/src/commands/migrations/v0_29_1.ts +++ b/src/commands/migrations/v0_29_1.ts @@ -48,7 +48,9 @@ async function phaseBBackfill(opts: OrchestratorOpts): Promise Date: Fri, 8 May 2026 22:37:52 +0200 Subject: [PATCH 05/41] fix: connect engine in v0.29.1 migration --- src/commands/migrations/v0_29_1.ts | 15 ++++++-- test/migration-v0-29-1.test.ts | 58 ++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 test/migration-v0-29-1.test.ts diff --git a/src/commands/migrations/v0_29_1.ts b/src/commands/migrations/v0_29_1.ts index f71165d3c..ea9c1ed2b 100644 --- a/src/commands/migrations/v0_29_1.ts +++ b/src/commands/migrations/v0_29_1.ts @@ -19,6 +19,7 @@ */ import { execSync } from 'child_process'; +import type { BrainEngine } from '../../core/engine.ts'; import type { Migration, OrchestratorOpts, OrchestratorResult, OrchestratorPhaseResult } from './types.ts'; import { childGlobalFlags } from '../../core/cli-options.ts'; @@ -42,6 +43,7 @@ function phaseASchema(opts: OrchestratorOpts): OrchestratorPhaseResult { async function phaseBBackfill(opts: OrchestratorOpts): Promise { if 
(opts.dryRun) return { name: 'backfill_effective_date', status: 'skipped', detail: 'dry-run' }; + let engine: BrainEngine | null = null; try { const { createEngine } = await import('../../core/engine-factory.ts'); const { loadConfig, toEngineConfig } = await import('../../core/config.ts'); @@ -49,7 +51,7 @@ async function phaseBBackfill(opts: OrchestratorOpts): Promise { if (opts.dryRun) return { name: 'verify', status: 'skipped', detail: 'dry-run' }; + let engine: BrainEngine | null = null; try { const { createEngine } = await import('../../core/engine-factory.ts'); const { loadConfig, toEngineConfig } = await import('../../core/config.ts'); const cfg = loadConfig(); if (!cfg) throw new Error('No gbrain config; run `gbrain init` first.'); const engineConfig = toEngineConfig(cfg); - const engine = await createEngine(engineConfig); + engine = await createEngine(engineConfig); await engine.connect(engineConfig); // Count rows where effective_date is still NULL but frontmatter HAS a // parseable date — those are the rows the backfill should have touched @@ -105,6 +112,10 @@ async function phaseCVerify(opts: OrchestratorOpts): Promise { + let tmp: string; + let oldGbrainHome: string | undefined; + + beforeEach(async () => { + oldGbrainHome = process.env.GBRAIN_HOME; + tmp = mkdtempSync(join(tmpdir(), 'gbrain-v0291-')); + process.env.GBRAIN_HOME = tmp; + + const gbrainHome = join(tmp, '.gbrain'); + const dbPath = join(tmp, 'brain-db'); + mkdirSync(gbrainHome, { recursive: true }); + writeFileSync( + join(gbrainHome, 'config.json'), + JSON.stringify({ engine: 'pglite', database_path: dbPath }, null, 2) + '\n', + ); + + const engine = await createEngine({ engine: 'pglite', database_path: dbPath }); + await engine.connect({ engine: 'pglite', database_path: dbPath }); + try { + await engine.initSchema(); + } finally { + await engine.disconnect(); + } + }); + + afterEach(() => { + if (oldGbrainHome === undefined) delete process.env.GBRAIN_HOME; + else process.env.GBRAIN_HOME 
= oldGbrainHome; + rmSync(tmp, { recursive: true, force: true }); + }); + + test('connects the PGLite engine before backfill and verify phases', async () => { + const backfill = await __testing.phaseBBackfill(opts); + expect(backfill.status).toBe('complete'); + expect(backfill.detail).toContain('examined=0'); + + const verify = await __testing.phaseCVerify(opts); + expect(verify).toEqual({ + name: 'verify', + status: 'complete', + detail: '0 pages with NULL effective_date', + }); + }); +}); From 918cb144262d0c473afcadfd457d3c2c98bc7a76 Mon Sep 17 00:00:00 2001 From: WD Date: Thu, 7 May 2026 17:22:42 +0800 Subject: [PATCH 06/41] fix(upgrade): detectBunLink fails because bun resolves symlinks in argv[1] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bun resolves the entire symlink chain before setting process.argv[1], so lstatSync(argv1).isSymbolicLink() always returns false for bun-link installs, short-circuiting the git-config walk that would correctly identify the repo. Remove the symlink gate — argv[1] is already the real path inside the checkout, which is what the walk needs. Also: return { repoRoot } so the upgrade path can auto-execute git pull + bun install via execFileSync (no shell injection surface). Fixes #368, supersedes incomplete v0.28.5 fix for #656. 
--- src/commands/upgrade.ts | 78 +++++++++++++++++++---------------------- test/upgrade.test.ts | 28 +++++++++++++-- 2 files changed, 61 insertions(+), 45 deletions(-) diff --git a/src/commands/upgrade.ts b/src/commands/upgrade.ts index 2787b966c..b736215be 100644 --- a/src/commands/upgrade.ts +++ b/src/commands/upgrade.ts @@ -1,6 +1,6 @@ -import { execSync } from 'child_process'; -import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, realpathSync, lstatSync } from 'fs'; -import { join, dirname } from 'path'; +import { execSync, execFileSync } from 'child_process'; +import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, realpathSync } from 'fs'; +import { join, dirname, resolve } from 'path'; import { VERSION } from '../version.ts'; const GBRAIN_GITHUB_REPO = 'garrytan/gbrain'; @@ -19,17 +19,23 @@ export async function runUpgrade(args: string[]) { let upgraded = false; switch (method) { - case 'bun-link': - // v0.28.5: bun-link installs are source clones. Pull + bun install - // is the upgrade path; npm/bun's update mechanism doesn't apply. - console.log('Upgrading via bun-link source clone...'); - console.log(' cd into your gbrain checkout, then run:'); - console.log(' git pull'); - console.log(' bun install'); - console.log(' bun link'); - console.log(''); - console.log(' (auto-detect can\'t do this for you because it doesn\'t know which checkout to update.)'); + case 'bun-link': { + const linkInfo = detectBunLink(); + if (!linkInfo) { + console.error('bun-link detected but could not resolve repo root.'); + break; + } + console.log(`Upgrading bun-link source clone at ${linkInfo.repoRoot}...`); + try { + execFileSync('git', ['-C', linkInfo.repoRoot, 'pull', '--ff-only'], { stdio: 'inherit', timeout: 120_000 }); + execFileSync('bun', ['install'], { cwd: linkInfo.repoRoot, stdio: 'inherit', timeout: 120_000 }); + upgraded = true; + } catch { + console.error('Auto-upgrade failed. 
Try manually:'); + console.error(` cd ${linkInfo.repoRoot} && git pull && bun install`); + } break; + } case 'bun': console.log('Upgrading via bun...'); @@ -298,7 +304,7 @@ export function detectInstallMethod(): 'bun' | 'bun-link' | 'binary' | 'clawhub' // resolves into a directory we can walk up from to find a .git/config // pointing at our repo. const bunLinkResult = detectBunLink(); - if (bunLinkResult === 'bun-link') return 'bun-link'; + if (bunLinkResult) return 'bun-link'; // Check if running from node_modules (bun/npm install). Could be canonical // (we publish under garrytan/gbrain) OR the squatter (npm `gbrain@1.3.x`). @@ -328,51 +334,39 @@ export function detectInstallMethod(): 'bun' | 'bun-link' | 'binary' | 'clawhub' } /** - * v0.28.5 cluster D, signal 1 — bun-link detection (closes #656). + * Detect bun-link source-clone installs (closes #656, fixes #368). * - * argv[1] is what `bun /path/to/cli.ts` was invoked with. When `bun link` - * is in play, that path is typically a symlink (~/.bun/bin/gbrain) to - * either the source repo's compiled binary or src/cli.ts directly. - * Walk up from the realpath looking for a `.git/config` whose remote - * url contains `garrytan/gbrain` (case-insensitive substring). + * Walk up from argv[1] looking for a `.git/config` whose remote url + * contains `garrytan/gbrain` (case-insensitive substring). * - * Returns 'bun-link' when we're confident; null otherwise (caller falls - * through to the existing detection chain). Best-effort: forks, tarball - * installs, detached source trees, and `.git`-less installs all fall - * through, which is acceptable per codex's plan-review feedback. + * v0.28.5 gated on lstatSync(argv1).isSymbolicLink(), but bun resolves + * the entire symlink chain before setting process.argv[1], so the check + * always returned false and short-circuited detection. 
Now we skip the + * symlink check and use argv[1] directly — it is already the real path + * inside the checkout, which is exactly what the git-config walk needs. + * + * Returns { repoRoot } when confident; null otherwise (caller falls + * through to the existing detection chain). */ -function detectBunLink(): 'bun-link' | null { +function detectBunLink(): { repoRoot: string } | null { try { const argv1 = process.argv[1]; if (!argv1) return null; - // Symlink check first: `bun link` always creates one. - let isSymlink = false; - try { - isSymlink = lstatSync(argv1).isSymbolicLink(); - } catch { - return null; - } - if (!isSymlink) return null; - - const resolved = realpathSync(argv1); - let dir = dirname(resolved); - // Walk up at most 6 levels looking for .git/config. + let dir = dirname(resolve(argv1)); for (let i = 0; i < 6; i++) { const gitConfigPath = join(dir, '.git', 'config'); if (existsSync(gitConfigPath)) { try { const cfg = readFileSync(gitConfigPath, 'utf-8'); - // Loose substring match: covers https://, git@, ssh://, fork URLs - // that mention upstream in [remote "upstream"], and case variants. if (cfg.toLowerCase().includes(GBRAIN_GITHUB_REPO.toLowerCase())) { - return 'bun-link'; + return { repoRoot: dir }; } } catch { /* unreadable config — not our case */ } - return null; // found .git/config but no match → not our repo + return null; } const parent = dirname(dir); - if (parent === dir) break; // reached filesystem root + if (parent === dir) break; dir = parent; } return null; diff --git a/test/upgrade.test.ts b/test/upgrade.test.ts index 2f30b3507..b19d51572 100644 --- a/test/upgrade.test.ts +++ b/test/upgrade.test.ts @@ -74,14 +74,36 @@ describe('detectInstallMethod heuristic (source analysis)', () => { // v0.28.5 cluster D: 3-signal layered detection. test('bun-link signal walks .git/config for garrytan/gbrain match', () => { - // detectBunLink reads .git/config and matches our repo name as a - // case-insensitive substring. 
Confirm both the function exists and - // that it does the loose substring check (not a strict URL parse). expect(source).toContain('function detectBunLink'); expect(source).toContain('GBRAIN_GITHUB_REPO'); expect(source).toContain('toLowerCase()'); }); + test('detectBunLink does not gate on isSymbolicLink (bun resolves argv[1])', () => { + // v0.28.5 gated on lstatSync(argv1).isSymbolicLink() which always + // returned false because bun resolves symlinks before setting argv[1]. + // The function body between "function detectBunLink" and the next + // top-level function must not contain isSymbolicLink. + const fnStart = source.indexOf('function detectBunLink'); + const fnEnd = source.indexOf('\nfunction ', fnStart + 1); + const fnBody = source.slice(fnStart, fnEnd > -1 ? fnEnd : undefined); + expect(fnBody).not.toContain('isSymbolicLink'); + expect(fnBody).not.toContain('lstatSync'); + }); + + test('detectBunLink returns repoRoot, not a string literal', () => { + expect(source).toContain("{ repoRoot: string } | null"); + expect(source).toContain('repoRoot: dir'); + }); + + test('bun-link upgrade uses execFileSync for shell-injection safety', () => { + // execFileSync with array args bypasses the shell (same pattern as + // dry-fix.ts:172). execSync with template strings is vulnerable to + // paths containing shell metacharacters. 
+ expect(source).toContain("execFileSync('git', ['-C', linkInfo.repoRoot, 'pull', '--ff-only']"); + expect(source).toContain("execFileSync('bun', ['install']"); + }); + test('classifyBunInstall checks repository.url AND src/cli.ts marker', () => { // Codex feedback: repository.url alone is spoofable by future squatter // updates; the source-marker fallback (src/cli.ts presence) is From 3b19e8328217f008ceea6ac38d9041ef70c72695 Mon Sep 17 00:00:00 2001 From: gus Date: Thu, 7 May 2026 22:22:05 -0300 Subject: [PATCH 07/41] =?UTF-8?q?fix(oauth):=20clamp=20authorize()=20reque?= =?UTF-8?q?sted=20scopes=20against=20client.scope=20(RFC=206749=20=C2=A73.?= =?UTF-8?q?3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MCP SDK's authorize handler (`@modelcontextprotocol/sdk/.../auth/handlers/authorize.js`) splits `?scope=...` verbatim and forwards the parsed list to the provider, so the provider has to clamp against the client's registered grant. v0.28.11 `authorize()` (src/core/oauth-provider.ts:235-259) inserted `params.scopes || []` raw into `oauth_codes`, so a `read`-registered client requesting `?scope=admin` had `['admin']` stored and `exchangeAuthorizationCode` issued a fully-admin access token at /token exchange. The asymmetry is the bug: the other two grant entry points already clamp. `exchangeClientCredentials` (line 513-515) filters requested scopes through `hasScope(allowedScopes, s)`, and `exchangeRefreshToken`'s F3 (line 372-380) enforces RFC 6749 §6 subset against the original grant. authorize() lined up with neither. Fix mirrors the client_credentials filter shape so all three grant entry points clamp consistently: const allowedScopes = parseScopeString(client.scope); const grantedScopes = (params.scopes || []).filter(s => hasScope(allowedScopes, s)); Empty/omitted requested scope keeps storing `[]` (existing shape, not a security boundary). 
The clamped subset is what the client sees in the `scope` field of the token response, which is the spec-compliant signal that the grant was reduced. Test coverage: - New: authorize clamps requested scopes against client.scope (RFC 6749 §3.3) — read-only client requests ['read','write','admin'] and the issued token carries only ['read']. - New: authorize subset request returns subset — 'read write' client requesting ['read'] gets ['read'] (regression guard against over-clamping). The existing v0.26.9 oauth.test.ts pins F3 (refresh clamp) but had no authorize-side coverage, which is why the regression survived. --- src/core/oauth-provider.ts | 15 +++++++++- test/oauth.test.ts | 56 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/core/oauth-provider.ts b/src/core/oauth-provider.ts index fcbeeed99..171b539df 100644 --- a/src/core/oauth-provider.ts +++ b/src/core/oauth-provider.ts @@ -241,11 +241,24 @@ export class GBrainOAuthProvider implements OAuthServerProvider { const codeHash = hashToken(code); const expiresAt = Math.floor(Date.now() / 1000) + 600; // 10 minute TTL + // Scope clamp (RFC 6749 §3.3): the SDK's authorize handler splits + // `?scope=...` verbatim and forwards the raw list to the provider, so + // the provider MUST clamp against the client's registered grant. Without + // this, a `read`-registered client requesting `?scope=admin` would have + // `['admin']` stored in oauth_codes and returned by exchangeAuthorizationCode + // as a fully-admin access token. Mirrors the filter pattern already used + // by exchangeClientCredentials (this file) and exchangeRefreshToken's F3 + // subset enforcement (RFC 6749 §6) so all three grant entry points clamp + // consistently. Empty/omitted requested scope inherits the empty-stored + // shape (existing behavior; not a security boundary). 
+ const allowedScopes = parseScopeString(client.scope); + const grantedScopes = (params.scopes || []).filter(s => hasScope(allowedScopes, s)); + await this.sql` INSERT INTO oauth_codes (code_hash, client_id, scopes, code_challenge, code_challenge_method, redirect_uri, state, resource, expires_at) VALUES (${codeHash}, ${client.client_id}, - ${pgArray(params.scopes || [])}, + ${pgArray(grantedScopes)}, ${params.codeChallenge}, ${'S256'}, ${params.redirectUri}, ${params.state || null}, ${params.resource?.toString() || null}, ${expiresAt}) diff --git a/test/oauth.test.ts b/test/oauth.test.ts index dd20bfb5f..dd7829f18 100644 --- a/test/oauth.test.ts +++ b/test/oauth.test.ts @@ -388,6 +388,62 @@ describe('authorization code flow', () => { await expect(provider.exchangeAuthorizationCode(client, expiredCode)).rejects.toThrow(); }); + // F-AUTHZ regression. The MCP SDK's authorize handler splits `?scope=...` + // verbatim and forwards the raw list to the provider, so the provider must + // clamp against the client's registered grant. Pre-fix the INSERT into + // oauth_codes used `params.scopes || []` raw, so a `read`-registered client + // requesting `?scope=admin` got an admin access token at /token exchange. + // This pins the parallel posture to client_credentials' filter pattern + // (line 513-515) and refresh's F3 subset enforcement (RFC 6749 §6). + test('authorize clamps requested scopes against client.scope (RFC 6749 §3.3)', async () => { + const { clientId } = await provider.registerClientManual( + 'authz-clamp-test', ['authorization_code'], 'read', + ['http://localhost:3000/callback'], + ); + const client = (await provider.clientsStore.getClient(clientId))!; + + let redirectUrl = ''; + const mockRes = { redirect: (url: string) => { redirectUrl = url; } } as any; + + // Read-only client requests admin via the SDK's parsed scopes array. 
+ await provider.authorize(client, { + codeChallenge: 'challenge', + redirectUri: 'http://localhost:3000/callback', + scopes: ['read', 'write', 'admin'], + }, mockRes); + + const code = new URL(redirectUrl).searchParams.get('code')!; + const tokens = await provider.exchangeAuthorizationCode(client, code); + + // The token's stored scopes must equal the clamped subset. + const auth = await provider.verifyAccessToken(tokens.access_token); + expect(auth.scopes).toEqual(['read']); + expect(auth.scopes).not.toContain('write'); + expect(auth.scopes).not.toContain('admin'); + }); + + test('authorize subset request returns subset', async () => { + const { clientId } = await provider.registerClientManual( + 'authz-subset-test', ['authorization_code'], 'read write', + ['http://localhost:3000/callback'], + ); + const client = (await provider.clientsStore.getClient(clientId))!; + + let redirectUrl = ''; + const mockRes = { redirect: (url: string) => { redirectUrl = url; } } as any; + + await provider.authorize(client, { + codeChallenge: 'challenge', + redirectUri: 'http://localhost:3000/callback', + scopes: ['read'], + }, mockRes); + + const code = new URL(redirectUrl).searchParams.get('code')!; + const tokens = await provider.exchangeAuthorizationCode(client, code); + const auth = await provider.verifyAccessToken(tokens.access_token); + expect(auth.scopes).toEqual(['read']); + }); + // CSO finding #2 regression. The pre-fix SELECT-then-DELETE pattern let two // concurrent token requests with the same code both pass the SELECT, both // running DELETE (no-op on second) and both calling issueTokens. 
The fix is From c0343daa1df343d1b4d428efe6410c31ed33b50e Mon Sep 17 00:00:00 2001 From: Trevin Chow Date: Mon, 4 May 2026 21:22:17 -0700 Subject: [PATCH 08/41] fix(sync): handle detached HEAD by skipping pull and ingesting local working tree --- src/commands/sync.ts | 53 ++++++++++++++++++++++++++-- test/sync.test.ts | 84 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 3 deletions(-) diff --git a/src/commands/sync.ts b/src/commands/sync.ts index 8d76e82fc..8952f9cdc 100644 --- a/src/commands/sync.ts +++ b/src/commands/sync.ts @@ -196,6 +196,33 @@ function git(repoPath: string, ...args: string[]): string { }).trim(); } +function isDetachedHead(repoPath: string): boolean { + try { + git(repoPath, 'symbolic-ref', '--quiet', 'HEAD'); + return false; + } catch { + return true; + } +} + +function unique<T>(items: T[]): T[] { + return [...new Set(items)]; +} + +function buildDetachedWorkingTreeManifest(repoPath: string): SyncManifest { + const manifest = buildSyncManifest(git(repoPath, 'diff', '--name-status', '-M', 'HEAD')); + const untracked = git(repoPath, 'ls-files', '--others', '--exclude-standard') + .split('\n') + .filter(line => line.length > 0); + + return { + added: unique([...manifest.added, ...untracked]), + modified: unique(manifest.modified), + deleted: unique(manifest.deleted), + renamed: manifest.renamed, + }; +} + // v0.18.0 Step 5: source-scoped sync state helpers. When opts.sourceId // is set, read/write the per-source row instead of the global config // keys.
These wrappers centralize the branch so every read/write site @@ -375,13 +402,21 @@ async function performSyncInner(engine: BrainEngine, opts: SyncOpts): Promise 0 || + detachedWorkingTreeManifest.modified.length > 0 || + detachedWorkingTreeManifest.deleted.length > 0 || + detachedWorkingTreeManifest.renamed.length > 0); + + if (lastCommit === headCommit && !versionMismatch && !versionNeverSet && !hasDetachedWorkingTreeChanges) { return { status: 'up_to_date', fromCommit: lastCommit, @@ -466,6 +507,12 @@ async function performSyncInner(engine: BrainEngine, opts: SyncOpts): Promise { // Structural assertion: the contract includes `embedded: number`. expect(typeof result.embedded).toBe('number'); }); + + test('detached HEAD skips git pull and ingests local working-tree files', async () => { + const { performSync } = await import('../src/commands/sync.ts'); + const seeded = await performSync(engine, { + repoPath, + noPull: true, + noEmbed: true, + noExtract: true, + }); + expect(seeded.status).toBe('first_sync'); + + execSync('git checkout --detach HEAD', { cwd: repoPath, stdio: 'pipe' }); + writeFileSync(join(repoPath, 'people/detached-local.md'), [ + '---', + 'type: person', + 'title: Detached Local', + '---', + '', + 'This file exists only in the detached working tree.', + ].join('\n')); + + const errors: string[] = []; + const originalError = console.error; + console.error = (...args: unknown[]) => { + errors.push(args.map(String).join(' ')); + }; + + try { + const result = await performSync(engine, { + repoPath, + noEmbed: true, + noExtract: true, + }); + + expect(result.status).toBe('synced'); + expect(result.added).toBe(1); + expect(result.pagesAffected).toContain('people/detached-local'); + } finally { + console.error = originalError; + } + + expect(errors.join('\n')).toContain(`Detached HEAD on ${repoPath}; skipping git pull. 
Syncing from local working tree.`); + expect(errors.join('\n')).not.toContain('git pull failed'); + + const page = await engine.getPage('people/detached-local'); + expect(page).not.toBeNull(); + expect(page!.title).toBe('Detached Local'); + }); + + test('detached HEAD with --no-pull also ingests local working-tree files', async () => { + const { performSync } = await import('../src/commands/sync.ts'); + const seeded = await performSync(engine, { + repoPath, + noPull: true, + noEmbed: true, + noExtract: true, + }); + expect(seeded.status).toBe('first_sync'); + + execSync('git checkout --detach HEAD', { cwd: repoPath, stdio: 'pipe' }); + writeFileSync(join(repoPath, 'people/detached-nopull.md'), [ + '---', + 'type: person', + 'title: Detached NoPull', + '---', + '', + 'Only in detached working tree, --no-pull caller.', + ].join('\n')); + + const result = await performSync(engine, { + repoPath, + noPull: true, + noEmbed: true, + noExtract: true, + }); + + expect(result.status).toBe('synced'); + expect(result.added).toBe(1); + expect(result.pagesAffected).toContain('people/detached-nopull'); + + const page = await engine.getPage('people/detached-nopull'); + expect(page).not.toBeNull(); + expect(page!.title).toBe('Detached NoPull'); + }); }); describe('sync regression — #132 nested transaction deadlock', () => { From 9919416af8069c99aae2e29d55e2d457590886b7 Mon Sep 17 00:00:00 2001 From: Brandon Lipman Date: Wed, 6 May 2026 13:35:30 -0400 Subject: [PATCH 09/41] fix(sync): --skip-failed acks pre-existing unacked failures up-front MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recovery flow that doctor + printSyncResult both advertise was broken: 1. User has files with bad YAML → they hit the failure log + sync stays blocked at last_commit. 2. User fixes the YAML. 3. User re-runs `gbrain sync` — sync succeeds, advances last_commit. 4. 
`gbrain doctor` still reports N unacked failures from step 1 because sync-failures.jsonl is append-only history, never auto-cleared. 5. doctor message says: "use 'gbrain sync --skip-failed' to acknowledge". 6. User runs `gbrain sync --skip-failed` → "Already up to date." → log unchanged. The bug: --skip-failed only acknowledges failures from the CURRENT run. performSync's ack path is gated on `failedFiles.length > 0` after sync — it never fires when the diff is empty (because the user already fixed the bad files) or when the sync is up to date. So the documented recovery sequence is a no-op exactly when the user needs it. The fix: at the top of runSync, when --skip-failed is set, eagerly ack any pre-existing unacked failures before any sync work runs. Now the flag means "acknowledge whatever is currently flagged and move on" regardless of whether the current run produces new failures or finds nothing to do. The inner per-run ack path stays — it still handles new failures from the CURRENT run, which is the (a) syncing now produces failures + (b) caller wants to ack them path. The two paths compose: `gbrain sync --skip-failed` clears stale + advances past anything new, all in one command, matching what the doctor message promises. Tests: 2 added in test/sync-failures.test.ts. One source-string pin on the new gate (the file's existing pattern for CLI-flag tests). One behavioral test on the underlying acknowledgeSyncFailures path. Repro: $ gbrain doctor [WARN] sync_failures: 27 unacknowledged sync failure(s)... Fix the file(s) and re-run 'gbrain sync', or use 'gbrain sync --skip-failed' to acknowledge. $ # ... fix the YAML ... $ gbrain sync Already up to date. $ gbrain sync --skip-failed Already up to date. # before this PR $ gbrain doctor [WARN] sync_failures: 27 unacknowledged sync failure(s)... # still! After: $ gbrain sync --skip-failed Acknowledged 27 pre-existing failure(s). Already up to date. 
$ gbrain doctor [OK] sync_failures: N historical sync failure(s), all acknowledged --- src/commands/sync.ts | 16 ++++++++++++++++ test/sync-failures.test.ts | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/src/commands/sync.ts b/src/commands/sync.ts index 8952f9cdc..0c6f8c072 100644 --- a/src/commands/sync.ts +++ b/src/commands/sync.ts @@ -1035,6 +1035,22 @@ export async function runSync(engine: BrainEngine, args: string[]) { process.exit(1); } + // --skip-failed: acknowledge pre-existing unacked failures BEFORE the sync + // runs, not only ones the current run produces. Without this, the common + // recovery flow — fix the YAML, re-run sync, then run --skip-failed to + // clear the log — fails to clear anything: when there are no NEW failures + // (because the files are now fixed), the inner ack path in performSync is + // never reached, and "Already up to date." leaves the log untouched. Both + // doctor and printSyncResult instruct users to run --skip-failed in + // exactly this case, so the flag has to handle stale entries up-front. + if (skipFailed) { + const stale = unacknowledgedSyncFailures(); + if (stale.length > 0) { + const acked = acknowledgeSyncFailures(); + console.log(`Acknowledged ${acked.count} pre-existing failure(s).`); + } + } + // v0.18.0 Step 5: --source resolves to a sources(id) row. Falls back // to pre-v0.17 global config (sync.repo_path + sync.last_commit) when // no flag, no env, no dotfile is present. 
diff --git a/test/sync-failures.test.ts b/test/sync-failures.test.ts index ea66b889f..4e8fe89a3 100644 --- a/test/sync-failures.test.ts +++ b/test/sync-failures.test.ts @@ -140,6 +140,35 @@ describe('Bug 9 — sync.ts CLI flag wiring', () => { expect(source).toContain('retryFailed'); }); + test('runSync acks pre-existing unacked failures up-front when --skip-failed is set', async () => { + // Without this gate, a user who fixes their broken YAML, re-runs sync + // (which finds nothing new and prints "Already up to date."), and then + // runs `gbrain sync --skip-failed` to clear the log gets a no-op — + // performSync's inner ack path only fires when failedFiles.length > 0 + // in the current run. This test pins the up-front ack at the top of + // runSync so the flag means "ack whatever is currently flagged". + const source = await Bun.file(new URL('../src/commands/sync.ts', import.meta.url)).text(); + // Ensure the up-front check exists before the syncAll / performSync + // dispatch, gated on skipFailed. + expect(source).toMatch(/if \(skipFailed\) \{[\s\S]*?unacknowledgedSyncFailures\(\)[\s\S]*?acknowledgeSyncFailures\(\)/); + }); + + test('acknowledgeSyncFailures clears stale failures end-to-end', async () => { + // Behavioral pin: the helper that --skip-failed delegates to must + // clear failures regardless of any current-run state. Mirrors the + // recovery flow: file fixed → sync clean → user wants log cleared. 
+ const { recordSyncFailures, acknowledgeSyncFailures, unacknowledgedSyncFailures } = await import('../src/core/sync.ts'); + recordSyncFailures([ + { path: 'people/old-broken.md', error: 'YAML: bad block mapping' }, + { path: 'people/old-broken.md', error: 'YAML: bad block mapping' }, // dup, dedup'd by recordSyncFailures + { path: 'meetings/stale.md', error: 'YAML: multiline key' }, + ], 'old-commit'); + expect(unacknowledgedSyncFailures().length).toBe(2); + const result = acknowledgeSyncFailures(); + expect(result.count).toBe(2); + expect(unacknowledgedSyncFailures().length).toBe(0); + }); + test('performSync gates sync.last_commit on failedFiles.length', async () => { const source = await Bun.file(new URL('../src/commands/sync.ts', import.meta.url)).text(); // The gate exists and references the failure set. From da5c9a248def8e7afc405a7e36e2fd774be07832 Mon Sep 17 00:00:00 2001 From: Brandon Lipman Date: Wed, 6 May 2026 14:40:54 -0400 Subject: [PATCH 10/41] fix(extract): default --dir to configured brain dir, not cwd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `gbrain extract links` (and timeline / all) defaulted --dir to '.' when not explicitly passed (src/commands/extract.ts:357). Combined with a walker that skips dotfiles but NOT node_modules/dist/build/vendor, this turned a no-arg invocation into a footgun. Repro: $ cd ~/Documents/some-project # has a node_modules/ tree $ gbrain extract links [extract.links_fs] 28989/28989 (100%) done Links: created 0 from 28989 pages Done: 0 links, 0 timeline entries from 28989 pages The "28989 pages" is `walkMarkdownFiles('.')` recursively eating package READMEs, dependency docs, fixture content. Their from_slug doesn't match any row in the pages table, so addLinksBatch rejects every insert and returns 0. Output looks like a healthy idempotent no-op; was actually a wasteful junk walk that wrote nothing. 
Fix: when --dir is not passed AND source is fs, resolve from sources(local_path) via getDefaultSourcePath — same helper sync uses (src/commands/sync.ts:1089). The default behavior now matches `sync`: "work on the configured brain". Falls back to a clear error when no source is configured, telling the user to either pass --dir, register a source, or use --source db. Behavior matrix: --dir explicit → use that path (unchanged) --dir absent + cfg → resolve from sources(local_path) --dir absent + no → error with actionable hint (was: walk cwd silently) --dir . → cwd (user opted in explicitly — unchanged) Tests: three added in test/extract-fs.test.ts: 1. configured source → no-arg invocation extracts from that path 2. no source configured → exit 1 + actionable error message 3. explicit --dir wins over a configured (decoy) source path --- src/commands/extract.ts | 34 +++++++++++++++- test/extract-fs.test.ts | 87 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 2 deletions(-) diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 9b9d08baa..d2539ca4e 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -354,7 +354,19 @@ export async function runExtractCore(engine: BrainEngine, opts: ExtractOpts): Pr export async function runExtract(engine: BrainEngine, args: string[]) { const subcommand = args[0]; const dirIdx = args.indexOf('--dir'); - const brainDir = (dirIdx >= 0 && dirIdx + 1 < args.length) ? args[dirIdx + 1] : '.'; + const explicitDir = dirIdx >= 0 && dirIdx + 1 < args.length; + // When --dir is not passed, resolve from the configured brain source + // BEFORE falling back to '.' (the prior default). 
The bare `.` default was + // a footgun: a user who runs `gbrain extract links` from anywhere outside + // their brain dir (e.g., a project checkout with a node_modules tree) had + // the recursive walker grab tens of thousands of unrelated .md files, + // attempt to extract links between them, then write 0 rows because the + // synthetic from_slugs don't match any pages row. The output ("created 0 + // links from 28989 pages") looks like a no-op, but it walked 28K junk files + // first. Resolving from sources(local_path) makes the no-arg invocation + // match what `gbrain sync` already does, and keeps cwd-cwd usage available + // via explicit `--dir .`. + let brainDir = explicitDir ? args[dirIdx + 1] : '.'; const sourceIdx = args.indexOf('--source'); const source = (sourceIdx >= 0 && sourceIdx + 1 < args.length) ? args[sourceIdx + 1] : 'fs'; const typeIdx = args.indexOf('--type'); @@ -390,7 +402,25 @@ export async function runExtract(engine: BrainEngine, args: string[]) { process.exit(1); } - // FS source needs a brain dir; DB source ignores --dir. + // FS source needs a brain dir. When --dir wasn't passed, resolve from + // sources(local_path) — same path `gbrain sync` uses — instead of + // silently walking cwd. See the brainDir comment above for the footgun. + if (source === 'fs' && !explicitDir) { + const { getDefaultSourcePath } = await import('../core/source-resolver.ts'); + const configured = await getDefaultSourcePath(engine); + if (configured) { + brainDir = configured; + } else { + console.error( + `No brain directory configured. Pass --dir explicitly, or use --source db ` + + `to extract from already-synced pages. To register a brain dir as the default, ` + + `run: gbrain sources add default --path `, + ); + process.exit(1); + } + } + + // DB source ignores --dir. 
if (source === 'fs' && !existsSync(brainDir)) { console.error(`Directory not found: ${brainDir}`); process.exit(1); diff --git a/test/extract-fs.test.ts b/test/extract-fs.test.ts index 97bbbcfa5..04f97dd82 100644 --- a/test/extract-fs.test.ts +++ b/test/extract-fs.test.ts @@ -159,3 +159,90 @@ title: Alice expect(elapsedMs).toBeLessThan(2000); }); }); + +describe('gbrain extract --dir default resolution', () => { + // Pin the cwd-footgun fix: when --dir is not passed, extract resolves the + // brain dir from the sources(local_path) row before falling back. The bare + // `.` default would let a user running from a directory with a node_modules/ + // tree walk tens of thousands of unrelated .md files and report + // "created 0 links from 28K pages" — looks like a no-op, was actually a + // wasteful junk walk that wrote nothing because synthetic from_slugs don't + // match the pages table. + test('uses configured sources(local_path) when --dir is not passed', async () => { + await engine.putPage('people/alice', personPage('Alice')); + await engine.putPage('people/bob', personPage('Bob')); + writeFile('people/alice.md', '---\ntitle: Alice\n---\n\n[Bob](../people/bob.md) is a friend.\n'); + writeFile('people/bob.md', '---\ntitle: Bob\n---\n'); + + // Register brainDir as the default source's local_path. + await (engine as any).db.exec( + `UPDATE sources SET local_path = '${brainDir.replace(/'/g, "''")}' WHERE id = 'default'`, + ); + + // Save + clobber cwd to a sibling tmpdir so the test fails loudly if the + // resolver still walks `.` instead of the configured path. 
+ const otherDir = mkdtempSync(join(tmpdir(), 'gbrain-extract-other-')); + const savedCwd = process.cwd(); + try { + process.chdir(otherDir); + await runExtract(engine, ['links']); // no --dir + } finally { + process.chdir(savedCwd); + try { rmSync(otherDir, { recursive: true, force: true }); } catch { /* ignore */ } + } + + const links = await engine.getLinks('people/alice'); + expect(links.length).toBe(1); + expect(links[0]).toMatchObject({ to_slug: 'people/bob' }); + }); + + test('errors with actionable message when no --dir and no source configured', async () => { + // Clear the default source's local_path so getDefaultSourcePath returns null. + await (engine as any).db.exec(`UPDATE sources SET local_path = NULL WHERE id = 'default'`); + await (engine as any).db.exec(`DELETE FROM config WHERE key = 'sync.repo_path'`); + + let exitCode: number | null = null; + const errBuf: string[] = []; + const savedExit = process.exit; + const savedConsoleError = console.error; + try { + (process as any).exit = (code: number) => { exitCode = code; throw new Error('__test_exit__'); }; + console.error = (...parts: unknown[]) => { errBuf.push(parts.join(' ')); }; + try { + await runExtract(engine, ['links']); + } catch (e) { + if (!(e instanceof Error && e.message === '__test_exit__')) throw e; + } + } finally { + (process as any).exit = savedExit; + console.error = savedConsoleError; + } + expect(exitCode).toBe(1); + const all = errBuf.join('\n'); + expect(all).toContain('No brain directory configured'); + expect(all).toContain('--source db'); + expect(all).toContain('--dir'); + }); + + test('explicit --dir always wins over configured source', async () => { + await engine.putPage('people/alice', personPage('Alice')); + await engine.putPage('people/bob', personPage('Bob')); + writeFile('people/alice.md', '---\ntitle: Alice\n---\n\n[Bob](../people/bob.md) is a friend.\n'); + writeFile('people/bob.md', '---\ntitle: Bob\n---\n'); + + // Configured path points elsewhere; explicit 
--dir must override. + const decoyDir = mkdtempSync(join(tmpdir(), 'gbrain-extract-decoy-')); + await (engine as any).db.exec( + `UPDATE sources SET local_path = '${decoyDir.replace(/'/g, "''")}' WHERE id = 'default'`, + ); + + try { + await runExtract(engine, ['links', '--dir', brainDir]); + const links = await engine.getLinks('people/alice'); + expect(links.length).toBe(1); + expect(links[0]).toMatchObject({ to_slug: 'people/bob' }); + } finally { + try { rmSync(decoyDir, { recursive: true, force: true }); } catch { /* ignore */ } + } + }); +}); From 1ad43c2b2974d2d077735d724295a82b5ca615b9 Mon Sep 17 00:00:00 2001 From: Federico Cachero Date: Thu, 7 May 2026 20:21:03 -0300 Subject: [PATCH 11/41] fix(extract): normalize slugs to lowercase via pathToSlug() (T-OBS-1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extractor was generating from_slug and the allSlugs lookup set from `relPath.replace('.md', '')` in 5 places, producing CAPS slugs for files named ETHOS.md, AGENTS.md, ROADMAP.md, etc. Pages persist in the DB with lowercase slug (core/sync.ts pathToSlug() applies .toLowerCase()). The CAPS extractor output mismatched the DB rows, so INSERT ... JOIN pages ON pages.slug = v.from_slug silently dropped links from CAPS-named source files. The link batch returned 'inserted' counts that were lower than the wikilinks actually present, with no error. Reproduction (in a brain with CAPS-named canonical docs): 1. echo 'See [agents](agents.md).' > ETHOS.md 2. gbrain put ethos < ETHOS.md # page row: slug='ethos' 3. gbrain extract links --source fs 4. 
gbrain backlinks agents → [] (expected: contains 'ethos') Fix: import pathToSlug from core/sync.ts and use it in all 5 sites: - extractLinksFromFile (line 200): from_slug derivation - runIncrementalExtractInternal (line 456): allSlugs set - extractLinksFromDir (line 552): allSlugs set - timeline loop (line 643): from_slug for timeline entries - extractLinksForSlugs (line 673): allSlugs set used by sync hook This single-line-per-site change keeps the extractor consistent with the sync layer's slug normalization and doesn't introduce any new behavior for already-lowercase paths (idempotent). Tests: added 'extractLinksFromFile — slug normalization (T-OBS-1 regression)' suite with 4 cases covering CAPS, mixed-case, idempotent lowercase, and nested path. Full extract suite (54 → 58 tests) passes. Reported by Claude Code (Opus 4.7) during Obsidian PKM integration on the gstack-plan Living Repo, where ~111 wikilinks pointing to ETHOS, AGENTS, ROADMAP, etc. failed to count toward brain_score (54/100 vs expected 75+/100). Documented as T-OBS-1 in the consumer's blocked.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/commands/extract.ts | 11 ++++----- test/extract.test.ts | 49 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/src/commands/extract.ts b/src/commands/extract.ts index d2539ca4e..227d2fba3 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -28,6 +28,7 @@ import { } from '../core/link-extraction.ts'; import { createProgress } from '../core/progress.ts'; import { getCliOptions, cliOptsToProgressOptions } from '../core/cli-options.ts'; +import { pathToSlug } from '../core/sync.ts'; // Batch size for addLinksBatch / addTimelineEntriesBatch. // Postgres bind-parameter limit is 65535. 
Links use 4 cols/row → 16K hard ceiling; @@ -197,7 +198,7 @@ export async function extractLinksFromFile( opts?: { includeFrontmatter?: boolean }, ): Promise { const links: ExtractedLink[] = []; - const slug = relPath.replace('.md', ''); + const slug = pathToSlug(relPath); const fileDir = dirname(relPath); const fm = parseFrontmatterFromContent(content, relPath); @@ -483,7 +484,7 @@ async function extractForSlugs( ): Promise<{ links_created: number; timeline_created: number; pages: number }> { // Build the full slug set for link resolution (fast: just readdir, no file reads) const allFiles = walkMarkdownFiles(brainDir); - const allSlugs = new Set(allFiles.map(f => f.relPath.replace('.md', ''))); + const allSlugs = new Set(allFiles.map(f => pathToSlug(f.relPath))); const doLinks = mode === 'links' || mode === 'all'; const doTimeline = mode === 'timeline' || mode === 'all'; @@ -579,7 +580,7 @@ async function extractLinksFromDir( engine: BrainEngine, brainDir: string, dryRun: boolean, jsonMode: boolean, ): Promise<{ created: number; pages: number }> { const files = walkMarkdownFiles(brainDir); - const allSlugs = new Set(files.map(f => f.relPath.replace('.md', ''))); + const allSlugs = new Set(files.map(f => pathToSlug(f.relPath))); // Progress stream on stderr (separate from the action-events --json writes // to stdout, which tests grep for). 
Rate-gated; respects global --quiet / @@ -670,7 +671,7 @@ async function extractTimelineFromDir( for (let i = 0; i < files.length; i++) { try { const content = readFileSync(files[i].path, 'utf-8'); - const slug = files[i].relPath.replace('.md', ''); + const slug = pathToSlug(files[i].relPath); for (const entry of extractTimelineFromContent(content, slug)) { if (dryRunSeen) { const key = `${entry.slug}::${entry.date}::${entry.summary}`; @@ -700,7 +701,7 @@ async function extractTimelineFromDir( export async function extractLinksForSlugs(engine: BrainEngine, repoPath: string, slugs: string[]): Promise { const allFiles = walkMarkdownFiles(repoPath); - const allSlugs = new Set(allFiles.map(f => f.relPath.replace('.md', ''))); + const allSlugs = new Set(allFiles.map(f => pathToSlug(f.relPath))); let created = 0; for (const slug of slugs) { const filePath = join(repoPath, slug + '.md'); diff --git a/test/extract.test.ts b/test/extract.test.ts index 4782b3ddf..1c9b15f14 100644 --- a/test/extract.test.ts +++ b/test/extract.test.ts @@ -142,3 +142,52 @@ describe('walkMarkdownFiles', () => { expect(typeof walkMarkdownFiles).toBe('function'); }); }); + +describe('extractLinksFromFile — slug normalization (T-OBS-1 regression)', () => { + // Regression coverage for the bug where CAPS-named files (ETHOS.md, AGENTS.md) + // generated CAPS slugs from `relPath.replace('.md', '')` while the DB stores + // pages.slug lowercase via pathToSlug() in core/sync.ts. The mismatch caused + // INSERT ... JOIN pages ON pages.slug = v.from_slug to silently drop links. + // Fix: extractor now uses pathToSlug() consistently for from_slug AND allSlugs. + + it('lowercases from_slug when relPath has CAPS filename', async () => { + // Note: link targets are kept lowercase (the convention used by the + // wikilink migration); this test focuses on from_slug derivation. 
+ const content = 'See [agents](agents.md) for the matrix.'; + const allSlugs = new Set(['ethos', 'agents']); + const links = await extractLinksFromFile(content, 'ETHOS.md', allSlugs); + expect(links.length).toBeGreaterThanOrEqual(1); + // Critical: from_slug must be lowercase regardless of the source file casing. + expect(links[0].from_slug).toBe('ethos'); + }); + + it('lowercases from_slug for mixed-case filename', async () => { + const content = 'Reference [hermes](hermes_nest.md).'; + const allSlugs = new Set(['hermes_nest', 'foo']); + const links = await extractLinksFromFile(content, 'Foo.md', allSlugs); + expect(links.length).toBeGreaterThanOrEqual(1); + expect(links[0].from_slug).toBe('foo'); + }); + + it('is idempotent for already-lowercase filenames', async () => { + const content = 'See [bar](bar.md).'; + const allSlugs = new Set(['foo', 'bar']); + const links = await extractLinksFromFile(content, 'foo.md', allSlugs); + expect(links.length).toBeGreaterThanOrEqual(1); + expect(links[0].from_slug).toBe('foo'); + }); + + it('lowercases nested path slug with mixed-case segment', async () => { + // relPath has mixed-case directory + filename. Link target is in the same + // directory (no .. traversal) so resolveSlug can hit allSlugs cleanly. 
+ const content = 'See [other](other.md).'; + const allSlugs = new Set(['decisions/0001-living-repo-pattern', 'decisions/other']); + const links = await extractLinksFromFile( + content, + 'decisions/0001-Living-Repo-Pattern.md', + allSlugs, + ); + expect(links.length).toBeGreaterThanOrEqual(1); + expect(links[0].from_slug).toBe('decisions/0001-living-repo-pattern'); + }); +}); From a6c804c1cc958b2645c52a6e9c9fdbecaec253ab Mon Sep 17 00:00:00 2001 From: Trevin Chow Date: Mon, 4 May 2026 21:15:46 -0700 Subject: [PATCH 12/41] fix(cli): CLI_ONLY commands should short-circuit on --help instead of executing --- src/cli.ts | 25 +++++++++++++- test/cli.test.ts | 87 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 106 insertions(+), 6 deletions(-) diff --git a/src/cli.ts b/src/cli.ts index 18c9fa273..745e1c821 100755 --- a/src/cli.ts +++ b/src/cli.ts @@ -25,6 +25,15 @@ for (const op of operations) { // CLI-only commands that bypass the operation layer const CLI_ONLY = new Set(['init', 'upgrade', 'post-upgrade', 'check-update', 'integrations', 'publish', 'check-backlinks', 'lint', 'report', 'import', 'export', 'files', 'embed', 'serve', 'call', 'config', 'doctor', 'migrate', 'eval', 'sync', 'extract', 'features', 'autopilot', 'graph-query', 'jobs', 'agent', 'apply-migrations', 'skillpack-check', 'skillpack', 'resolvers', 'integrity', 'repair-jsonb', 'orphans', 'sources', 'mounts', 'dream', 'check-resolvable', 'routing-eval', 'skillify', 'smoke-test', 'providers', 'storage', 'repos', 'code-def', 'code-refs', 'reindex-code', 'reindex-frontmatter', 'code-callers', 'code-callees', 'frontmatter', 'auth', 'friction', 'claw-test', 'book-mirror', 'takes', 'think', 'salience', 'anomalies', 'transcripts', 'remote']); +// CLI-only commands whose handlers print their own --help text. These are +// excluded from the generic short-circuit so detailed per-command and +// per-subcommand usage stays reachable. 
+const CLI_ONLY_SELF_HELP = new Set([ + 'upgrade', 'post-upgrade', 'check-update', + 'embed', 'config', + 'skillpack', 'skillpack-check', + 'integrations', 'friction', +]); async function main() { // Parse global flags (--quiet / --progress-json / --progress-interval) @@ -60,12 +69,16 @@ async function main() { } // Per-command --help - if (subArgs.includes('--help') || subArgs.includes('-h')) { + if (hasHelpFlag(subArgs)) { const op = cliOps.get(command); if (op) { printOpHelp(op); return; } + if (CLI_ONLY.has(command) && !CLI_ONLY_SELF_HELP.has(command)) { + printCliOnlyHelp(command); + return; + } } // CLI-only commands @@ -138,6 +151,16 @@ async function main() { } } +function hasHelpFlag(args: string[]): boolean { + return args.includes('--help') || args.includes('-h'); +} + +function printCliOnlyHelp(command: string) { + console.log(`Usage: gbrain ${command}`); + console.log(''); + console.log(`gbrain ${command} - run gbrain --help for the full command list.`); +} + /** * v0.27.1: shared transform for `gbrain query --image ` (and any future * CLI surface that takes an image path). 
Reads the file, base64-encodes, diff --git a/test/cli.test.ts b/test/cli.test.ts index f381f9215..e85da900b 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -1,8 +1,22 @@ import { describe, test, expect } from 'bun:test'; -import { readFileSync } from 'fs'; +import { existsSync, mkdtempSync, readFileSync, rmSync } from 'fs'; +import { tmpdir } from 'os'; +import { join } from 'path'; // Read cli.ts source for structural checks const cliSource = readFileSync(new URL('../src/cli.ts', import.meta.url), 'utf-8'); +const repoRoot = new URL('..', import.meta.url).pathname; + +function isolatedEnv(home: string): Record { + const env: Record = {}; + for (const [key, value] of Object.entries(process.env)) { + if (value !== undefined) env[key] = value; + } + delete env.GBRAIN_DATABASE_URL; + delete env.DATABASE_URL; + env.GBRAIN_HOME = home; + return env; +} describe('CLI structure', () => { test('imports operations from operations.ts', () => { @@ -86,7 +100,7 @@ describe('CLI dispatch integration', () => { test('per-command --help prints usage without DB connection', async () => { const proc = Bun.spawn(['bun', 'run', 'src/cli.ts', 'get', '--help'], { - cwd: new URL('..', import.meta.url).pathname, + cwd: repoRoot, stdout: 'pipe', stderr: 'pipe', }); @@ -98,7 +112,7 @@ describe('CLI dispatch integration', () => { test('upgrade --help prints usage without running upgrade', async () => { const proc = Bun.spawn(['bun', 'run', 'src/cli.ts', 'upgrade', '--help'], { - cwd: new URL('..', import.meta.url).pathname, + cwd: repoRoot, stdout: 'pipe', stderr: 'pipe', }); @@ -108,9 +122,72 @@ describe('CLI dispatch integration', () => { expect(exitCode).toBe(0); }); + test('sync --help short-circuits CLI-only dispatch without running sync', async () => { + const home = mkdtempSync(join(tmpdir(), 'gbrain-cli-help-')); + try { + const proc = Bun.spawn(['bun', 'run', 'src/cli.ts', 'sync', '--help'], { + cwd: repoRoot, + stdout: 'pipe', + stderr: 'pipe', + env: isolatedEnv(home), + 
}); + const stdout = await new Response(proc.stdout).text(); + const stderr = await new Response(proc.stderr).text(); + const exitCode = await proc.exited; + expect(stdout).toContain('Usage: gbrain sync'); + expect(stdout).toContain('run gbrain --help for the full command list'); + expect(stdout).not.toContain('Already up to date.'); + expect(stderr).not.toContain('Already up to date.'); + expect(existsSync(join(home, '.gbrain', 'config.json'))).toBe(false); + expect(exitCode).toBe(0); + } finally { + rmSync(home, { recursive: true, force: true }); + } + }); + + test('doctor --help short-circuits CLI-only dispatch without diagnostics', async () => { + const home = mkdtempSync(join(tmpdir(), 'gbrain-cli-help-')); + try { + const proc = Bun.spawn(['bun', 'run', 'src/cli.ts', 'doctor', '--help'], { + cwd: repoRoot, + stdout: 'pipe', + stderr: 'pipe', + env: isolatedEnv(home), + }); + const stdout = await new Response(proc.stdout).text(); + const stderr = await new Response(proc.stderr).text(); + const exitCode = await proc.exited; + expect(stdout).toContain('Usage: gbrain doctor'); + expect(stdout).not.toContain('resolver_health'); + expect(stderr).not.toContain('No brain configured'); + expect(exitCode).toBe(0); + } finally { + rmSync(home, { recursive: true, force: true }); + } + }); + + test('init --help short-circuits CLI-only dispatch without writing config', async () => { + const home = mkdtempSync(join(tmpdir(), 'gbrain-cli-help-')); + try { + const proc = Bun.spawn(['bun', 'run', 'src/cli.ts', 'init', '--help'], { + cwd: repoRoot, + stdout: 'pipe', + stderr: 'pipe', + env: isolatedEnv(home), + }); + const stdout = await new Response(proc.stdout).text(); + const exitCode = await proc.exited; + expect(stdout).toContain('Usage: gbrain init'); + expect(existsSync(join(home, '.gbrain', 'config.json'))).toBe(false); + expect(exitCode).toBe(0); + } finally { + rmSync(home, { recursive: true, force: true }); + } + }); + test('--help prints global help', async () => { 
const proc = Bun.spawn(['bun', 'run', 'src/cli.ts', '--help'], { - cwd: new URL('..', import.meta.url).pathname, + cwd: repoRoot, stdout: 'pipe', stderr: 'pipe', }); @@ -123,7 +200,7 @@ describe('CLI dispatch integration', () => { test('--tools-json outputs valid JSON with operations', async () => { const proc = Bun.spawn(['bun', 'run', 'src/cli.ts', '--tools-json'], { - cwd: new URL('..', import.meta.url).pathname, + cwd: repoRoot, stdout: 'pipe', stderr: 'pipe', }); From d832f2bb0dfa973597ce87d1e24cc7892d47d945 Mon Sep 17 00:00:00 2001 From: Brandon Lipman Date: Wed, 6 May 2026 14:38:11 -0400 Subject: [PATCH 13/41] fix(doctor): correct command syntax in graph_coverage warn message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit graph_coverage warn directs users to run `gbrain link-extract && gbrain timeline-extract`, but no commands by those names are registered in cli.ts. The actual commands are `gbrain extract links` and `gbrain extract timeline` (registered as the 'extract' subcommand at src/cli.ts:525, with the kind argument 'links' / 'timeline' / 'all' parsed inside src/commands/extract.ts). A user who runs the suggested command gets: $ gbrain link-extract Unknown command: link-extract This is the only place in src/ with the wrong syntax — the rest of the docs (init.ts:221, init.ts:331, features.ts:120, v0_13_0.ts:67, sync.ts:752 comment) all already say 'extract links'. This patch just brings doctor.ts in line. --- src/commands/doctor.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/commands/doctor.ts b/src/commands/doctor.ts index 6d2b654be..b7db79562 100644 --- a/src/commands/doctor.ts +++ b/src/commands/doctor.ts @@ -875,7 +875,7 @@ export async function runDoctor(engine: BrainEngine | null, args: string[], dbSo checks.push({ name: 'graph_coverage', status: 'warn', - message: `Entity link coverage ${linkPct}%, timeline ${timelinePct}%. 
Run: gbrain link-extract && gbrain timeline-extract`, + message: `Entity link coverage ${linkPct}%, timeline ${timelinePct}%. Run: gbrain extract links && gbrain extract timeline`, }); } From 58026d1cf1c198f792e73a72bbe9eaf5c739e98b Mon Sep 17 00:00:00 2001 From: Brandon Lipman Date: Wed, 6 May 2026 13:10:15 -0400 Subject: [PATCH 14/41] fix(doctor): use autoDetectSkillsDir so OpenClaw workspaces are reachable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `gbrain doctor` was the only consumer of `findRepoRoot` from `core/repo-root.ts`. Every other consumer (check-resolvable.ts:145, skillify.ts, etc.) uses `autoDetectSkillsDir`, which has the full detection chain: 1. \$OPENCLAW_WORKSPACE 2. ~/.openclaw/workspace 3. findRepoRoot() walk from cwd 4. ./skills `findRepoRoot` only does step 3. Result: when the user runs `gbrain doctor` from any directory outside the gbrain repo or the OpenClaw workspace tree (e.g., a project's checkout), `resolver_health` reports "Could not find skills directory" even though the dispatcher exists at ~/.openclaw/workspace/skills/RESOLVER.md. Reproduces in any directory other than ~/gbrain or its descendants on a system with ~/.openclaw/workspace/skills/RESOLVER.md present: \$ cd ~/Documents \$ gbrain doctor [WARN] resolver_health: Could not find skills directory # before [WARN] resolver_health: 5 issue(s): 0 error(s), 5 warning(s) # after Switching doctor to `autoDetectSkillsDir` brings it inline with the rest of the codebase. The detected dir is also passed to `checkSkillConformance` (step 2 of the resolver_health block), which previously rebuilt the path from `repoRoot` — now uses the same detected path for consistency. All 15 existing tests in test/doctor.test.ts continue to pass. 
--- src/commands/doctor.ts | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/commands/doctor.ts b/src/commands/doctor.ts index b7db79562..4a763fa95 100644 --- a/src/commands/doctor.ts +++ b/src/commands/doctor.ts @@ -3,7 +3,7 @@ import * as db from '../core/db.ts'; import { LATEST_VERSION, getIdleBlockers } from '../core/migrate.ts'; import { checkResolvable } from '../core/check-resolvable.ts'; import { autoFixDryViolations, type AutoFixReport, type FixOutcome } from '../core/dry-fix.ts'; -import { findRepoRoot } from '../core/repo-root.ts'; +import { autoDetectSkillsDir } from '../core/repo-root.ts'; import { loadCompletedMigrations } from '../core/preferences.ts'; import { compareVersions } from './migrations/index.ts'; import { createProgress, startHeartbeat, type ProgressReporter } from '../core/progress.ts'; @@ -228,9 +228,12 @@ export async function runDoctor(engine: BrainEngine | null, args: string[], dbSo // --- Filesystem checks (always run, no DB needed) --- // 1. Resolver health - const repoRoot = findRepoRoot(); - if (repoRoot) { - const skillsDir = join(repoRoot, 'skills'); + // Use the same auto-detect as `check-resolvable` so doctor sees a + // workspace/skills dir reachable via $OPENCLAW_WORKSPACE or + // ~/.openclaw/workspace, not just a `skills/` walked up from cwd. + const detected = autoDetectSkillsDir(); + const skillsDir = detected.dir; + if (skillsDir) { // --fix: run auto-repair BEFORE checkResolvable so the post-fix scan // reflects the new state. Auto-fix only targets DRY violations today; @@ -268,8 +271,7 @@ export async function runDoctor(engine: BrainEngine | null, args: string[], dbSo } // 2. 
Skill conformance - if (repoRoot) { - const skillsDir = join(repoRoot, 'skills'); + if (skillsDir) { const conformanceResult = checkSkillConformance(skillsDir); checks.push(conformanceResult); } From 6581804ffa22d8f2eda520ee7d101d26273ed4f6 Mon Sep 17 00:00:00 2001 From: Josh Stein Date: Wed, 6 May 2026 15:10:13 -0700 Subject: [PATCH 15/41] fix(mcp): exit serve process on stdin-close/SIGTERM MCP stdio server was keeping the bun process alive indefinitely after the client disconnected. Over days this accumulated 20+ orphaned gbrain serve processes, all holding the PGLite directory open. Since PGLite is single-writer, this caused write-lock contention that made email-sync fail its 15s per-put timeout: 114 puts x 15s = 28.5min runs with 0 emails written. Now listens for stdin end/close, transport close, and SIGTERM/SIGINT/ SIGHUP; calls engine.disconnect() and exits cleanly. Root cause for the no-gbrain-run-in-50h alert. --- src/mcp/server.ts | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/mcp/server.ts b/src/mcp/server.ts index be0ae8ab3..f97b5d4a1 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -40,6 +40,26 @@ export async function startMcpServer(engine: BrainEngine) { const transport = new StdioServerTransport(); await server.connect(transport); + + // Exit cleanly when MCP client disconnects (stdin EOF) or on signals. + // Without this, orphaned serve processes accumulate and contend for the + // PGLite write lock, causing ingest jobs (email-sync) to time out. 
+ let shuttingDown = false; + const shutdown = (reason: string, code = 0) => { + if (shuttingDown) return; + shuttingDown = true; + process.stderr.write(`[gbrain-serve] shutdown: ${reason}\n`); + Promise.resolve(engine.disconnect?.()) + .catch(() => {}) + .finally(() => process.exit(code)); + }; + process.stdin.on('end', () => shutdown('stdin end')); + process.stdin.on('close', () => shutdown('stdin close')); + // @ts-ignore — SDK exposes onclose on transport + transport.onclose = () => shutdown('transport close'); + process.on('SIGTERM', () => shutdown('SIGTERM')); + process.on('SIGINT', () => shutdown('SIGINT')); + process.on('SIGHUP', () => shutdown('SIGHUP')); } // Backward compat: used by `gbrain call` command (trusted local path). From 99af49dcd51b9c20ed59b2702afd30bcbe35aa92 Mon Sep 17 00:00:00 2001 From: Matt Gunnin Date: Thu, 7 May 2026 13:14:52 -0500 Subject: [PATCH 16/41] =?UTF-8?q?fix(skills):=20broaden=20RESOLVER=20trigg?= =?UTF-8?q?ers=20+=201=20ambiguity=20flag=20(37=20misses=20=E2=86=92=200,?= =?UTF-8?q?=20100%=20top-1=20accuracy)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `bun run src/cli.ts routing-eval` was reporting 37 ROUTING_MISS entries across 10 skills whose RESOLVER.md trigger phrases didn't match any of their own routing-eval.jsonl fixture intents. Two distinct causes: 1. Single-phrase triggers in 9 skills under '## Uncategorized' didn't cover the paraphrased fixture variations they're supposed to route. Broadened each trigger cell to a quoted-phrase list that covers the fixtures (5 fixtures per skill on average). 2. The media-ingest row used unquoted prose ('Video, audio, PDF, book, YouTube, screenshot') which extractTriggerPhrases() collapses into one impossible long phrase ('video audio pdf book youtube screenshot') under normalizeText — no fixture intent will ever contain that exact substring. Converted to a quoted phrase list. 3. 
One fixture ('web research pass on this person') legitimately matches both `perplexity-research` and `data-research` (data-research's trigger row contains "Research"). Marked the fixture `ambiguous_with: ["data-research"]` since the overlap on the keyword 'research' is inherent and expected. Skills with broadened triggers: - voice-note-ingest, article-enrichment, book-mirror, archive-crawler, brain-pdf, academic-verify, concept-synthesis, perplexity-research, strategic-reading, media-ingest Before: 58 cases, 37 misses, ~36% top-1 accuracy After: 58 cases, 0 misses, 100% top-1 accuracy This also clears `gbrain doctor`'s `resolver_health: 37 issue(s)` warning. --- skills/RESOLVER.md | 20 +++++++++---------- skills/perplexity-research/routing-eval.jsonl | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/skills/RESOLVER.md b/skills/RESOLVER.md index f4f81ef95..c3c8318fc 100644 --- a/skills/RESOLVER.md +++ b/skills/RESOLVER.md @@ -28,7 +28,7 @@ This is the dispatcher. Skills are the implementation. 
**Read the skill file bef | Trigger | Skill | |---------|-------| | User shares a link, article, tweet, or idea | `skills/idea-ingest/SKILL.md` | -| Video, audio, PDF, book, YouTube, screenshot | `skills/media-ingest/SKILL.md` | +| "video", "PDF book", "YouTube", "screenshot", "summarize this book", "ingest this PDF", "process this book", "ingest it into my brain" | `skills/media-ingest/SKILL.md` | | Meeting transcript received | `skills/meeting-ingestion/SKILL.md` | | Generic "ingest this" (auto-routes to above) | `skills/ingest/SKILL.md` | @@ -109,20 +109,20 @@ These apply to ALL brain-writing skills: | Trigger | Skill | |---------|-------| -| "personalized version of this book" | `skills/book-mirror/SKILL.md` | +| "personalized version of this book", "mirror this book", "two-column book", "book to my life", "this book apply to me", "personalized version" | `skills/book-mirror/SKILL.md` | -| "enrich this article" | `skills/article-enrichment/SKILL.md` | +| "enrich this article", "enriching the article", "enrich the article", "enrich brain pages", "batch enrich", "enrich pass" | `skills/article-enrichment/SKILL.md` | -| "strategic reading" | `skills/strategic-reading/SKILL.md` | +| "strategic reading", "read this through the lens", "apply this to my problem", "what can I learn from this", "extract a playbook from this" | `skills/strategic-reading/SKILL.md` | -| "concept synthesis" | `skills/concept-synthesis/SKILL.md` | +| "concept synthesis", "synthesize my concepts", "intellectual map", "find patterns across my notes", "trace idea evolution", "canon vs riff" | `skills/concept-synthesis/SKILL.md` | -| "perplexity research" | `skills/perplexity-research/SKILL.md` | +| "perplexity research", "perplexity-research", "what's new about this", "current state of", "web research pass", "what changed about", "surface new developments" | `skills/perplexity-research/SKILL.md` | -| "crawl my archive" | `skills/archive-crawler/SKILL.md` | +| "crawl my archive", "find gold in 
my archive", "archive crawler", "scan my dropbox", "mine my old files" | `skills/archive-crawler/SKILL.md` | -| "verify this academic claim" | `skills/academic-verify/SKILL.md` | +| "verify this academic claim", "check this study", "academic verify", "validate citation", "Retraction Watch", "is this study real" | `skills/academic-verify/SKILL.md` | -| "make pdf from brain" | `skills/brain-pdf/SKILL.md` | +| "make pdf from brain", "brain pdf", "convert brain page to pdf", "page as pdf", "export brain page", "publish this page as pdf" | `skills/brain-pdf/SKILL.md` | -| "voice note" | `skills/voice-note-ingest/SKILL.md` | +| "voice note", "voice memo", "audio message", "audio note", "transcribe and file" | `skills/voice-note-ingest/SKILL.md` | diff --git a/skills/perplexity-research/routing-eval.jsonl b/skills/perplexity-research/routing-eval.jsonl index 2755b3e3d..54bc4b6af 100644 --- a/skills/perplexity-research/routing-eval.jsonl +++ b/skills/perplexity-research/routing-eval.jsonl @@ -3,5 +3,5 @@ {"intent":"Run perplexity-research on Brex and surface NEW developments","expected_skill":"perplexity-research","ambiguous_with":["data-research"]} {"intent":"What's new about this company that the brain doesn't already cover","expected_skill":"perplexity-research"} {"intent":"Tell me the current state of the YC W26 batch announcements","expected_skill":"perplexity-research"} -{"intent":"Do a web research pass on this person — focus on the delta","expected_skill":"perplexity-research"} +{"intent":"Do a web research pass on this person — focus on the delta","expected_skill":"perplexity-research","ambiguous_with":["data-research"]} {"intent":"What changed about this funding round since I last looked","expected_skill":"perplexity-research"} From 6cfddb87a9f6f0de8d326d38f03e8d6d2d9936c1 Mon Sep 17 00:00:00 2001 From: Michael Dela Cruz Date: Thu, 7 May 2026 22:47:23 +0900 Subject: [PATCH 17/41] fix(multi-source): thread source_id through per-page tx surface MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-source brains crashed mid-import with Postgres 21000 ("more than one row returned by a subquery used as an expression"). Root cause: putPage's INSERT column list omitted source_id, so writes intended for a non-default source (e.g. 'jarvis-memory') silently fabricated a duplicate row at (default, slug). The schema has UNIQUE(source_id, slug) but DEFAULT 'default' for source_id; calling putPage(slug, page) without source_id landed at (default, slug) and ON CONFLICT updated the wrong row, leaving the intended source row stale. Subsequent bare-slug subqueries inside the same tx — (SELECT id FROM pages WHERE slug = $1) in getTags / removeTag / deleteChunks / removeLink / addLink (cross-product) — then matched 2 rows and crashed with 21000, rolling back the entire import. Observed: 18 sync failures against a 'jarvis-memory'-sourced brain. Fix: - putPage adds source_id to the INSERT column list (defaults 'default' for back-compat). - Every bare-slug page-id subquery becomes source-qualified (AND source_id = $X) in both engines: createVersion, upsertChunks, getChunks, addTag, removeTag, getTags, deleteChunks, removeLink, addTimelineEntry, deletePage, updateSlug. - addLink rewritten away from FROM pages f, pages t cross-product into a VALUES + JOIN-on-(slug, source_id) shape mirroring addLinksBatch. - engine.ts interface: 11 method signatures gain optional opts.sourceId (or opts.{from,to,origin}SourceId for addLink/removeLink). All optional; existing callers default to source='default' and behave identically. - import-file.ts: importFromContent / importFromFile / importCodeFile take opts.sourceId and thread txOpts = { sourceId } through every per-page tx call. engine.getPage callsite source-scoped for accurate idempotency. 
- commands/sync.ts: thread opts.sourceId at importFile (line 581 + 641), un-syncable cleanup (487-498), delete phase (557), rename phase (574), and post-sync extract phase (815-816). - commands/reindex-code.ts: thread opts.sourceId at importCodeFile call. - commands/extract.ts: extractLinksForSlugs / extractTimelineForSlugs accept opts.sourceId and propagate via linkOpts / entryOpts. - commands/reconcile-links.ts: ReconcileLinksOpts.sourceId was declared but ignored end-to-end; now wired through getPage + addLink calls. - commands/migrate-engine.ts: --force wipe switched to executeRaw('DELETE FROM pages') to preserve the pre-PR all-sources semantic after deletePage became default-source-scoped. Regression test: test/source-id-tx-regression.test.ts (19 tests). Validates two sources × same slug coexist; getTags/addTag/removeTag/deleteChunks/ upsertChunks/createVersion/addLink/addTimelineEntry/deletePage/updateSlug source-scoped writes don't 21000; back-compat without opts targets source='default'; addLink fail-fast on missing source-qualified endpoint; importFromContent end-to-end tx thread without fabricating duplicate. Adversarial review: Codex (gpt-5.5 reviewer) + Grok (xAI flagship reviewer) 3-round crew loop. Round 1: 2 HIGH (addTimelineEntry + extract.ts thread) + 2 MED. Round 2: 1 CRITICAL + 1 HIGH (deletePage + updateSlug bare-slug) + 2 MED. Round 3: 2 HIGH (getChunks + migrate-engine semantic regression introduced by R2 fix). Round 4: both reviewers CLEAR. Deferred to follow-up PRs (noted as TODO): - src/commands/embed.ts source-aware threading (auto-embed at sync.ts:823 has a TODO; try/catch swallows the failure as best-effort). - src/core/postgres-engine.ts:1511 / pglite-engine.ts:1446 putRawData bare-slug (lower-impact metadata path). - Read-surface bare-slug consistency cleanup (getLinks/getBacklinks/ getTimeline/getRawData/getVersions): non-mutating, won't 21000. 
- reconcile-links.ts CLI --source flag exposure (internal opt is wired; CLI parser is a UX feature for later). Existing rows in production written under (default, slug) by the old putPage when caller meant another source remain misrouted. Backfill heuristics need install-specific knowledge of intended source and are outside this PR's scope; surface as a deployment-side cleanup task. bun run typecheck clean, bun run build clean, 19/19 regression tests pass, 4082 unit pass / 1 pre-existing fail (BrainRegistry test depending on test-env ~/.gbrain/ absence — fails on untouched main, unrelated). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/commands/extract.ts | 29 +- src/commands/migrate-engine.ts | 12 +- src/commands/reconcile-links.ts | 20 +- src/commands/reindex-code.ts | 1 + src/commands/sync.ts | 49 ++- src/core/engine.ts | 91 +++++- src/core/import-file.ts | 73 +++-- src/core/pglite-engine.ts | 192 +++++++---- src/core/postgres-engine.ts | 155 +++++---- test/source-id-tx-regression.test.ts | 466 +++++++++++++++++++++++++++ 10 files changed, 913 insertions(+), 175 deletions(-) create mode 100644 test/source-id-tx-regression.test.ts diff --git a/src/commands/extract.ts b/src/commands/extract.ts index 227d2fba3..71adff0dd 100644 --- a/src/commands/extract.ts +++ b/src/commands/extract.ts @@ -699,9 +699,21 @@ async function extractTimelineFromDir( // --- Sync integration hooks --- -export async function extractLinksForSlugs(engine: BrainEngine, repoPath: string, slugs: string[]): Promise { +export async function extractLinksForSlugs( + engine: BrainEngine, + repoPath: string, + slugs: string[], + opts?: { sourceId?: string }, +): Promise { const allFiles = walkMarkdownFiles(repoPath); const allSlugs = new Set(allFiles.map(f => pathToSlug(f.relPath))); + // v0.18.0+ multi-source: post-sync extract reconciles same-source edges. + // Markdown→markdown links within one repo always live in the caller's + // sourceId. 
Cross-source extraction (rare) would need a per-repo source + // manifest; not in this PR's scope. + const linkOpts = opts?.sourceId + ? { fromSourceId: opts.sourceId, toSourceId: opts.sourceId, originSourceId: opts.sourceId } + : undefined; let created = 0; for (const slug of slugs) { const filePath = join(repoPath, slug + '.md'); @@ -709,14 +721,23 @@ export async function extractLinksForSlugs(engine: BrainEngine, repoPath: string try { const content = readFileSync(filePath, 'utf-8'); for (const link of await extractLinksFromFile(content, slug + '.md', allSlugs)) { - try { await engine.addLink(link.from_slug, link.to_slug, link.context, link.link_type); created++; } catch { /* skip */ } + try { await engine.addLink(link.from_slug, link.to_slug, link.context, link.link_type, undefined, undefined, undefined, linkOpts); created++; } catch { /* skip */ } } } catch { /* skip */ } } return created; } -export async function extractTimelineForSlugs(engine: BrainEngine, repoPath: string, slugs: string[]): Promise { +export async function extractTimelineForSlugs( + engine: BrainEngine, + repoPath: string, + slugs: string[], + opts?: { sourceId?: string }, +): Promise { + // v0.18.0+ multi-source: source-qualify so timeline rows don't fan out + // across every source containing the slug (the addTimelineEntry's + // INSERT...SELECT-from-pages fan-out was Data R1's HIGH 2). + const entryOpts = opts?.sourceId ? 
{ sourceId: opts.sourceId } : undefined; let created = 0; for (const slug of slugs) { const filePath = join(repoPath, slug + '.md'); @@ -724,7 +745,7 @@ export async function extractTimelineForSlugs(engine: BrainEngine, repoPath: str try { const content = readFileSync(filePath, 'utf-8'); for (const entry of extractTimelineFromContent(content, slug)) { - try { await engine.addTimelineEntry(entry.slug, { date: entry.date, source: entry.source, summary: entry.summary, detail: entry.detail }); created++; } catch { /* skip */ } + try { await engine.addTimelineEntry(entry.slug, { date: entry.date, source: entry.source, summary: entry.summary, detail: entry.detail }, entryOpts); created++; } catch { /* skip */ } } } catch { /* skip */ } } diff --git a/src/commands/migrate-engine.ts b/src/commands/migrate-engine.ts index 5984d1c90..0d40e43fa 100644 --- a/src/commands/migrate-engine.ts +++ b/src/commands/migrate-engine.ts @@ -117,11 +117,13 @@ export async function runMigrateEngine(sourceEngine: BrainEngine, args: string[] if (targetStats.page_count > 0 && opts.force) { console.log('--force: wiping target brain...'); - // Delete all pages (cascades to chunks, links, tags, etc.) - const pages = await targetEngine.listPages({ limit: 100000 }); - for (const p of pages) { - await targetEngine.deletePage(p.slug); - } + // v0.18.0+ multi-source: deletePage(slug) is now source-scoped (defaults + // to 'default'), so per-page iteration would skip non-default-source + // rows. migrate-engine --force is a destructive wipe across the entire + // brain — all sources, all pages — so we issue a raw DELETE that matches + // the original semantic. Cascades through content_chunks / page_links / + // tags / timeline_entries / page_versions via existing FKs. 
+ await targetEngine.executeRaw('DELETE FROM pages'); } // Load or create manifest for resume diff --git a/src/commands/reconcile-links.ts b/src/commands/reconcile-links.ts index f41e77e2b..c2487bcfe 100644 --- a/src/commands/reconcile-links.ts +++ b/src/commands/reconcile-links.ts @@ -89,8 +89,16 @@ export async function runReconcileLinks( // Fetch pages one at a time via getPage (no bulk read helper exists yet). // On a 47K-page brain this is the slow path; a v0.20.x follow-up can add // getPagesBatch. For the typical 2K–5K markdown count it's fine. + // v0.18.0+ multi-source: source-scope getPage so reconcile picks up the + // intended-source row for `default`-vs-`<source>` ambiguity. The link + // edges below also propagate the same sourceId (Data R1 MED 1: opt was + // declared on ReconcileLinksOpts but ignored end-to-end). + const getPageOpts = opts.sourceId ? { sourceId: opts.sourceId } : undefined; + const linkOpts = opts.sourceId + ? { fromSourceId: opts.sourceId, toSourceId: opts.sourceId, originSourceId: opts.sourceId } + : undefined; for (const mdSlug of mdSlugs) { - const page = await engine.getPage(mdSlug); + const page = await engine.getPage(mdSlug, getPageOpts); if (!page) { progress.tick(1, mdSlug); continue; @@ -113,10 +121,12 @@ export async function runReconcileLinks( const ctx = ref.line ? `cited at ${ref.path}:${ref.line}` : ref.path; edgesAttempted++; try { - // Forward: guide documents code. addLink's inner SELECT drops - // silently if codeSlug isn't a page yet (benign — counted below). - await engine.addLink(mdSlug, codeSlug, ctx, 'documents', 'markdown', mdSlug, 'compiled_truth'); - await engine.addLink(codeSlug, mdSlug, ref.path, 'documented_by', 'markdown', mdSlug, 'compiled_truth'); + // Forward: guide documents code. addLink's inner JOIN drops silently + // if codeSlug isn't a page yet (benign — counted below). Source- + // qualified per opts.sourceId; same-source assumption mirrors the + // import-file.ts:303 doc↔impl auto-link.
+ await engine.addLink(mdSlug, codeSlug, ctx, 'documents', 'markdown', mdSlug, 'compiled_truth', linkOpts); + await engine.addLink(codeSlug, mdSlug, ref.path, 'documented_by', 'markdown', mdSlug, 'compiled_truth', linkOpts); } catch (e: unknown) { // Per-link errors don't abort the batch. Track them for the summary. const msg = e instanceof Error ? e.message : String(e); diff --git a/src/commands/reindex-code.ts b/src/commands/reindex-code.ts index 713d2c9ed..2fbfc6396 100644 --- a/src/commands/reindex-code.ts +++ b/src/commands/reindex-code.ts @@ -200,6 +200,7 @@ export async function runReindexCode( const result = await importCodeFile(engine, relPath, row.compiled_truth, { noEmbed: opts.noEmbed, force: opts.force, + sourceId: opts.sourceId, }); if (result.status === 'imported') reindexed++; else if (result.status === 'skipped') skipped++; diff --git a/src/commands/sync.ts b/src/commands/sync.ts index 0c6f8c072..7f5c8da81 100644 --- a/src/commands/sync.ts +++ b/src/commands/sync.ts @@ -530,12 +530,16 @@ async function performSyncInner(engine: BrainEngine, opts: SyncOpts): Promise !isSyncable(p, syncOpts)); + // v0.18.0+ multi-source: scope getPage + deletePage to opts.sourceId so + // unsyncable cleanup in source A doesn't accidentally sweep same-slug + // pages in sources B/C/D. + const pageOpts = opts.sourceId ? 
{ sourceId: opts.sourceId } : undefined; for (const path of unsyncableModified) { const slug = resolveSlugForPath(path); try { - const existing = await engine.getPage(slug); + const existing = await engine.getPage(slug, pageOpts); if (existing) { - await engine.deletePage(slug); + await engine.deletePage(slug, pageOpts); console.log(` Deleted un-syncable page: ${slug}`); } } catch { /* ignore */ } @@ -597,11 +601,14 @@ async function performSyncInner(engine: BrainEngine, opts: SyncOpts): Promise 0) { progress.start('sync.deletes', filtered.deleted.length); for (const path of filtered.deleted) { const slug = resolveSlugForPath(path); - await engine.deletePage(slug); + await engine.deletePage(slug, deleteOpts); pagesAffected.push(slug); progress.tick(1, slug); } @@ -614,18 +621,22 @@ async function performSyncInner(engine: BrainEngine, opts: SyncOpts): Promise 0) { progress.start('sync.renames', filtered.renamed.length); + // v0.18.0+ multi-source: scope updateSlug so the rename only touches the + // source-A row, not every same-slug row across sources (which would + // either sweep them all OR violate (source_id, slug) UNIQUE). + const renameOpts = opts.sourceId ? 
{ sourceId: opts.sourceId } : undefined; for (const { from, to } of filtered.renamed) { const oldSlug = resolveSlugForPath(from); const newSlug = resolveSlugForPath(to); try { - await engine.updateSlug(oldSlug, newSlug); + await engine.updateSlug(oldSlug, newSlug, renameOpts); } catch { // Slug doesn't exist or collision, treat as add } // Reimport at new path (picks up content changes) const filePath = join(repoPath, to); if (existsSync(filePath)) { - const result = await importFile(engine, filePath, to, { noEmbed }); + const result = await importFile(engine, filePath, to, { noEmbed, sourceId: opts.sourceId }); if (result.status === 'imported') chunksCreated += result.chunks; } pagesAffected.push(newSlug); @@ -680,7 +691,12 @@ async function performSyncInner(engine: BrainEngine, opts: SyncOpts): Promise 0) { try { const { extractLinksForSlugs, extractTimelineForSlugs } = await import('./extract.ts'); - const linksCreated = await extractLinksForSlugs(engine, repoPath, pagesAffected); - const timelineCreated = await extractTimelineForSlugs(engine, repoPath, pagesAffected); + const linksCreated = await extractLinksForSlugs(engine, repoPath, pagesAffected, extractOpts); + const timelineCreated = await extractTimelineForSlugs(engine, repoPath, pagesAffected, extractOpts); if (linksCreated > 0 || timelineCreated > 0) { console.log(` Extracted: ${linksCreated} links, ${timelineCreated} timeline entries`); } } catch { /* extraction is best-effort */ } } - // Auto-embed (skip for large syncs — embedding calls OpenAI) + // Auto-embed (skip for large syncs — embedding calls OpenAI). + // TODO(multi-source): runEmbed → src/commands/embed.ts:175 + :418 call + // upsertChunks defaulting to source='default'. For non-default-source syncs + // the page row lives at (sourceId, slug) so this fails with "Page not found" + // OR (when a same-slug 'default' row coexists) updates the wrong source's + // chunks. 
Data R1 MED 2 — deferred to a follow-up PR; threading sourceId + // through embed.ts is a larger refactor than this fix's scope. The current + // try/catch swallows the failure as best-effort, so the sync result still + // reports `embedded: 0` for the right reason. let embedded = 0; if (!noEmbed && pagesAffected.length > 0 && pagesAffected.length <= 100) { try { diff --git a/src/core/engine.ts b/src/core/engine.ts index 4f1a6f774..feb3f2146 100644 --- a/src/core/engine.ts +++ b/src/core/engine.ts @@ -343,7 +343,14 @@ export interface BrainEngine { * by `restore_page` flow, and by operator diagnostics. */ getPage(slug: string, opts?: GetPageOpts): Promise; - putPage(slug: string, page: PageInput): Promise; + /** + * Insert or update a page. When `opts.sourceId` is omitted, the row is + * written under the schema DEFAULT ('default'). When provided, `source_id` + * is included in the INSERT column list so ON CONFLICT (source_id, slug) + * DO UPDATE actually targets the intended row instead of fabricating a + * duplicate at (default, slug). Multi-source brains MUST pass sourceId. + */ + putPage(slug: string, page: PageInput, opts?: { sourceId?: string }): Promise; /** * Hard-delete a page row. Cascades to content_chunks, page_links, * chunk_relations via existing FK ON DELETE CASCADE. @@ -353,7 +360,13 @@ export interface BrainEngine { * as the underlying primitive used by `purgeDeletedPages` and by callers * that explicitly want hard-delete semantics (e.g. test setup teardown). */ - deletePage(slug: string): Promise; + /** + * v0.18.0+ multi-source: `opts.sourceId` scopes the DELETE so a source-A + * delete doesn't hard-delete the same-slug pages in sources B/C/D. Without + * it, the bare DELETE matches every row with that slug across all sources. + * Cascades through content_chunks / page_links / chunk_relations via FKs. + */ + deletePage(slug: string, opts?: { sourceId?: string }): Promise; /** * v0.26.5 — set `deleted_at = now()` on a page. 
Returns the slug if a row * was soft-deleted, null if no row matched (already soft-deleted OR not found). @@ -392,8 +405,20 @@ export interface BrainEngine { getEmbeddingsByChunkIds(ids: number[]): Promise>; // Chunks - upsertChunks(slug: string, chunks: ChunkInput[]): Promise; - getChunks(slug: string): Promise; + /** + * Replace the chunk set for a page. Internal page-id lookup is sourceId- + * scoped when `opts.sourceId` is given; without it, the schema DEFAULT + * matches and bare-slug lookup blows up if the same slug exists in + * multiple sources (Postgres 21000). + */ + upsertChunks(slug: string, chunks: ChunkInput[], opts?: { sourceId?: string }): Promise; + /** + * Read every chunk for a page. `opts.sourceId` source-scopes the page + * lookup; without it, multi-source brains return chunks from every + * same-slug source (importCodeFile uses this for incremental embedding + * reuse, which would then attach the wrong source's embeddings). + */ + getChunks(slug: string, opts?: { sourceId?: string }): Promise; /** * Count chunks across the entire brain where embedded_at IS NULL. * Pre-flight short-circuit for `embed --stale` so a 100%-embedded brain @@ -409,7 +434,12 @@ export interface BrainEngine { * Bounded by an internal LIMIT of 100000 to mirror listPages. */ listStaleChunks(): Promise; - deleteChunks(slug: string): Promise; + /** + * Delete every chunk for a page. Internal page-id lookup is sourceId-scoped + * when `opts.sourceId` is given; otherwise the bare-slug subquery returns + * the wrong row count in multi-source brains. + */ + deleteChunks(slug: string, opts?: { sourceId?: string }): Promise; // Links /** @@ -417,6 +447,12 @@ export interface BrainEngine { * with pre-v0.13 callers. Pass 'frontmatter' + originSlug + originField for * frontmatter-derived edges; 'manual' for user-initiated edges. */ + /** + * v0.18.0+ multi-source: each endpoint can live in a different source. 
+ * `opts.fromSourceId` / `opts.toSourceId` / `opts.originSourceId` default to + * 'default'. Without these, the original cross-product `FROM pages f, pages t` + * fanned out across every source containing the slug. + */ addLink( from: string, to: string, @@ -425,6 +461,7 @@ export interface BrainEngine { linkSource?: string, originSlug?: string, originField?: string, + opts?: { fromSourceId?: string; toSourceId?: string; originSourceId?: string }, ): Promise; /** * Bulk insert links via a single multi-row INSERT...SELECT FROM (VALUES) JOIN pages @@ -441,7 +478,13 @@ export interface BrainEngine { * 'manual') — used by runAutoLink reconciliation to avoid deleting edges from * other provenances when pruning frontmatter-derived edges. */ - removeLink(from: string, to: string, linkType?: string, linkSource?: string): Promise; + removeLink( + from: string, + to: string, + linkType?: string, + linkSource?: string, + opts?: { fromSourceId?: string; toSourceId?: string }, + ): Promise; getLinks(slug: string): Promise; getBacklinks(slug: string): Promise; /** @@ -519,9 +562,15 @@ export interface BrainEngine { findOrphanPages(): Promise>; // Tags - addTag(slug: string, tag: string): Promise; - removeTag(slug: string, tag: string): Promise; - getTags(slug: string): Promise; + /** + * v0.18.0+ multi-source: `opts.sourceId` scopes the page-id lookup. When + * omitted, the schema DEFAULT 'default' applies; in multi-source brains + * with the same slug across sources the bare-slug lookup returns >1 row + * and the INSERT/DELETE fails with Postgres 21000. + */ + addTag(slug: string, tag: string, opts?: { sourceId?: string }): Promise; + removeTag(slug: string, tag: string, opts?: { sourceId?: string }): Promise; + getTags(slug: string, opts?: { sourceId?: string }): Promise; // Timeline /** @@ -530,10 +579,17 @@ export interface BrainEngine { * known to exist (e.g., from a getAllSlugs() snapshot). 
Duplicates are silently * deduplicated by the (page_id, date, summary) UNIQUE index (ON CONFLICT DO NOTHING). */ + /** + * Insert a timeline entry. By default verifies the page exists and throws if not. + * `opts.skipExistenceCheck` skips the pre-check for batch loops where the slug + * is already known to exist. `opts.sourceId` source-scopes both the existence + * check AND the page-id lookup inside the INSERT — required for multi-source + * brains where the slug exists in 2+ sources. + */ addTimelineEntry( slug: string, entry: TimelineInput, - opts?: { skipExistenceCheck?: boolean }, + opts?: { skipExistenceCheck?: boolean; sourceId?: string }, ): Promise; /** * Bulk insert timeline entries via a single multi-row INSERT...SELECT FROM (VALUES) @@ -670,7 +726,12 @@ export interface BrainEngine { putDreamVerdict(filePath: string, contentHash: string, verdict: DreamVerdictInput): Promise; // Versions - createVersion(slug: string): Promise; + /** + * Snapshot a page row into page_versions. Source-scoped via `opts.sourceId`; + * without it the bare-slug lookup snapshots whichever row Postgres returns + * first when the slug exists across multiple sources. + */ + createVersion(slug: string, opts?: { sourceId?: string }): Promise; getVersions(slug: string): Promise; revertToVersion(slug: string, versionId: number): Promise; @@ -683,7 +744,13 @@ export interface BrainEngine { getIngestLog(opts?: { limit?: number }): Promise; // Sync - updateSlug(oldSlug: string, newSlug: string): Promise; + /** + * Rename a page's slug (chunks + links + tags + timeline + versions all + * preserved via stable page_id). `opts.sourceId` scopes the UPDATE — without + * it, the bare `WHERE slug = old` matches every row across every source and + * would either rename them all OR violate the (source_id, slug) UNIQUE. 
+ */ + updateSlug(oldSlug: string, newSlug: string, opts?: { sourceId?: string }): Promise; rewriteLinks(oldSlug: string, newSlug: string): Promise; // Config diff --git a/src/core/import-file.ts b/src/core/import-file.ts index 076af7a42..20e6fbf13 100644 --- a/src/core/import-file.ts +++ b/src/core/import-file.ts @@ -188,6 +188,7 @@ export async function importFromContent( content: string, opts: { noEmbed?: boolean; + sourceId?: string; /** * v0.29.1: basename without extension for filename-date precedence on * `daily/`, `meetings/` slugs. importFromFile threads this from the @@ -196,6 +197,12 @@ export async function importFromContent( filename?: string; } = {}, ): Promise { + // v0.18.0+ multi-source: when caller is syncing under a non-default source, + // every per-page tx call must carry `sourceId` so writes target the right + // (source_id, slug) row. Pre-fix, putPage relied on the schema DEFAULT and + // silently fabricated a duplicate at (default, slug) — causing later + // bare-slug subqueries (getTags, deleteChunks, etc.) to crash with 21000. + const sourceId = opts.sourceId; // Reject oversized payloads before any parsing, chunking, or embedding happens. // Uses Buffer.byteLength to count UTF-8 bytes the same way disk size would, // so the network path behaves identically to the file path. @@ -232,7 +239,7 @@ export async function importFromContent( tags: parsed.tags, }; - const existing = await engine.getPage(slug); + const existing = await engine.getPage(slug, sourceId ? { sourceId } : undefined); if (existing?.content_hash === hash) { return { slug, status: 'skipped', chunks: 0, parsedPage }; } @@ -268,9 +275,13 @@ export async function importFromContent( } } - // Transaction wraps all DB writes + // Transaction wraps all DB writes. Every per-page tx call carries the + // caller's sourceId so writes target (sourceId, slug) rather than the + // schema DEFAULT — required for multi-source brains; harmless ('default') + // for single-source callers. 
+ const txOpts = sourceId ? { sourceId } : undefined; await engine.transaction(async (tx) => { - if (existing) await tx.createVersion(slug); + if (existing) await tx.createVersion(slug, txOpts); // v0.29.1 — compute effective_date from frontmatter precedence chain. // Filename comes from importFromFile path (basename) or the slug tail @@ -299,23 +310,23 @@ export async function importFromContent( effective_date: effectiveDate, effective_date_source: effectiveDateSource, import_filename: filenameForChain, - }); + }, txOpts); // Tag reconciliation: remove stale, add current - const existingTags = await tx.getTags(slug); + const existingTags = await tx.getTags(slug, txOpts); const newTags = new Set(parsed.tags); for (const old of existingTags) { - if (!newTags.has(old)) await tx.removeTag(slug, old); + if (!newTags.has(old)) await tx.removeTag(slug, old, txOpts); } for (const tag of parsed.tags) { - await tx.addTag(slug, tag); + await tx.addTag(slug, tag, txOpts); } if (chunks.length > 0) { - await tx.upsertChunks(slug, chunks); + await tx.upsertChunks(slug, chunks, txOpts); } else { // Content is empty — delete stale chunks so they don't ghost in search results - await tx.deleteChunks(slug); + await tx.deleteChunks(slug, txOpts); } // v0.19.0 E1 — doc↔impl linking: if this markdown page cites code paths @@ -325,6 +336,15 @@ export async function importFromContent( // before their code repo syncs are common, and the missing edges land // later via `gbrain reconcile-links` (Layer 8 D3, v0.21.0). const codeRefs = extractCodeRefs(parsed.compiled_truth + '\n' + (parsed.timeline || '')); + // For doc↔impl edges, both endpoints are within the same source as the + // markdown page being imported. Cross-source edges (markdown in one + // source, code in another) currently fail with "page not found" — a + // faster failure mode than the pre-fix cross-product fan-out, which + // silently wired edges to whichever same-slug page Postgres returned + // first across sources. 
+ const linkOpts = sourceId + ? { fromSourceId: sourceId, toSourceId: sourceId, originSourceId: sourceId } + : undefined; for (const ref of codeRefs) { const codeSlug = slugifyCodePath(ref.path); // Forward: markdown guide → code page (this guide documents that code) @@ -333,6 +353,7 @@ export async function importFromContent( slug, codeSlug, ref.line ? `cited at ${ref.path}:${ref.line}` : ref.path, 'documents', 'markdown', slug, 'compiled_truth', + linkOpts, ); } catch { /* code page not yet imported — reconcile-links will catch it */ } // Reverse: code page → markdown guide (this code is documented by the guide) @@ -340,6 +361,7 @@ export async function importFromContent( await tx.addLink( codeSlug, slug, ref.path, 'documented_by', 'markdown', slug, 'compiled_truth', + linkOpts, ); } catch { /* same reason — silent skip */ } } @@ -362,7 +384,7 @@ export async function importFromFile( engine: BrainEngine, filePath: string, relativePath: string, - opts: { noEmbed?: boolean; inferFrontmatter?: boolean } = {}, + opts: { noEmbed?: boolean; inferFrontmatter?: boolean; sourceId?: string } = {}, ): Promise { // Defense-in-depth: reject symlinks before reading content. 
const lstat = lstatSync(filePath); @@ -379,7 +401,10 @@ export async function importFromFile( // Route code files through the code import path if (isCodeFilePath(relativePath)) { - return importCodeFile(engine, relativePath, content, opts); + return importCodeFile(engine, relativePath, content, { + noEmbed: opts.noEmbed, + sourceId: opts.sourceId, + }); } // v0.22.8 — Frontmatter inference: if the file has no frontmatter and @@ -431,11 +456,13 @@ export async function importCodeFile( engine: BrainEngine, relativePath: string, content: string, - opts: { noEmbed?: boolean; force?: boolean } = {}, + opts: { noEmbed?: boolean; force?: boolean; sourceId?: string } = {}, ): Promise { const slug = slugifyCodePath(relativePath); const lang = detectCodeLanguage(relativePath) || 'unknown'; const title = `${relativePath} (${lang})`; + const sourceId = opts.sourceId; + const txOpts = sourceId ? { sourceId } : undefined; const byteLength = Buffer.byteLength(content, 'utf-8'); if (byteLength > MAX_FILE_SIZE) { @@ -448,7 +475,7 @@ export async function importCodeFile( .update(JSON.stringify({ title, type: 'code', content, lang, chunker_version: CHUNKER_VERSION })) .digest('hex'); - const existing = await engine.getPage(slug); + const existing = await engine.getPage(slug, sourceId ? { sourceId } : undefined); if (!opts.force && existing?.content_hash === hash) { return { slug, status: 'skipped', chunks: 0 }; } @@ -486,7 +513,7 @@ export async function importCodeFile( // OpenAI API. Order matters: our chunk_index is semantic (tree-sitter // order), so a matching (chunk_index, text_hash) means a verbatim // preserved symbol. - const existingChunks = existing ? await engine.getChunks(slug) : []; + const existingChunks = existing ? await engine.getChunks(slug, sourceId ? 
{ sourceId } : undefined) : []; const existingByKey = new Map(); for (const ec of existingChunks) { existingByKey.set(`${ec.chunk_index}:${ec.chunk_text}`, ec); @@ -519,9 +546,11 @@ export async function importCodeFile( } } - // Store + // Store. Every per-page tx call carries `txOpts.sourceId` so multi-source + // brains write to the correct (source_id, slug) row instead of duplicating + // under the schema DEFAULT. await engine.transaction(async (tx) => { - if (existing) await tx.createVersion(slug); + if (existing) await tx.createVersion(slug, txOpts); await tx.putPage(slug, { type: 'code' as PageType, @@ -531,15 +560,15 @@ export async function importCodeFile( timeline: '', frontmatter: { language: lang, file: relativePath }, content_hash: hash, - }); + }, txOpts); - await tx.addTag(slug, 'code'); - await tx.addTag(slug, lang); + await tx.addTag(slug, 'code', txOpts); + await tx.addTag(slug, lang, txOpts); if (chunks.length > 0) { - await tx.upsertChunks(slug, chunks); + await tx.upsertChunks(slug, chunks, txOpts); } else { - await tx.deleteChunks(slug); + await tx.deleteChunks(slug, txOpts); } }); @@ -550,7 +579,7 @@ export async function importCodeFile( // chunk IDs are stable. if (extractedEdges.length > 0 && chunks.length > 0) { try { - const persistedChunks = await engine.getChunks(slug); + const persistedChunks = await engine.getChunks(slug, sourceId ? 
{ sourceId } : undefined); const byIndex = new Map(); for (const pc of persistedChunks) { byIndex.set(pc.chunk_index, pc); diff --git a/src/core/pglite-engine.ts b/src/core/pglite-engine.ts index 1439266be..52ffdaff6 100644 --- a/src/core/pglite-engine.ts +++ b/src/core/pglite-engine.ts @@ -486,12 +486,16 @@ export class PGLiteEngine implements BrainEngine { return rowToPage(rows[0] as Record); } - async putPage(slug: string, page: PageInput): Promise { + async putPage(slug: string, page: PageInput, opts?: { sourceId?: string }): Promise { slug = validateSlug(slug); const hash = page.content_hash || contentHash(page); const frontmatter = page.frontmatter || {}; + const sourceId = opts?.sourceId ?? 'default'; - // v0.18.0 Step 2: source_id relies on the schema DEFAULT 'default'. + // v0.18.0 Step 5+: source_id is now in the INSERT column list so multi- + // source callers land on the intended (source_id, slug) row. Omitting it + // let the schema DEFAULT 'default' apply, fabricating duplicate slugs that + // later made bare-slug subqueries return multiple rows. // ON CONFLICT target is (source_id, slug); global UNIQUE(slug) dropped in v17. const pageKind = page.page_kind || 'markdown'; // v0.29.1 — additive opt-in columns. COALESCE(EXCLUDED.x, pages.x) @@ -503,8 +507,8 @@ export class PGLiteEngine implements BrainEngine { const effectiveDateSource = page.effective_date_source ?? null; const importFilename = page.import_filename ?? 
null; const { rows } = await this.db.query( - `INSERT INTO pages (slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at, effective_date, effective_date_source, import_filename) - VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb, $8, now(), $9::timestamptz, $10, $11) + `INSERT INTO pages (source_id, slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at, effective_date, effective_date_source, import_filename) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb, $9, now(), $10::timestamptz, $11, $12) ON CONFLICT (source_id, slug) DO UPDATE SET type = EXCLUDED.type, page_kind = EXCLUDED.page_kind, @@ -518,13 +522,17 @@ export class PGLiteEngine implements BrainEngine { effective_date_source = COALESCE(EXCLUDED.effective_date_source, pages.effective_date_source), import_filename = COALESCE(EXCLUDED.import_filename, pages.import_filename) RETURNING id, slug, type, title, compiled_truth, timeline, frontmatter, content_hash, created_at, updated_at, effective_date, effective_date_source, import_filename`, - [slug, page.type, pageKind, page.title, page.compiled_truth, page.timeline || '', JSON.stringify(frontmatter), hash, effectiveDate, effectiveDateSource, importFilename] + [sourceId, slug, page.type, pageKind, page.title, page.compiled_truth, page.timeline || '', JSON.stringify(frontmatter), hash, effectiveDate, effectiveDateSource, importFilename] ); return rowToPage(rows[0] as Record); } - async deletePage(slug: string): Promise { - await this.db.query('DELETE FROM pages WHERE slug = $1', [slug]); + async deletePage(slug: string, opts?: { sourceId?: string }): Promise { + const sourceId = opts?.sourceId ?? 
'default'; + await this.db.query( + 'DELETE FROM pages WHERE slug = $1 AND source_id = $2', + [slug, sourceId] + ); } async softDeletePage(slug: string, opts?: { sourceId?: string }): Promise<{ slug: string } | null> { @@ -921,10 +929,16 @@ export class PGLiteEngine implements BrainEngine { } // Chunks - async upsertChunks(slug: string, chunks: ChunkInput[]): Promise { - // Get page_id - const pageResult = await this.db.query('SELECT id FROM pages WHERE slug = $1', [slug]); - if (pageResult.rows.length === 0) throw new Error(`Page not found: ${slug}`); + async upsertChunks(slug: string, chunks: ChunkInput[], opts?: { sourceId?: string }): Promise { + const sourceId = opts?.sourceId ?? 'default'; + + // Source-scope the page-id lookup so duplicate slugs in different sources + // do not return multiple rows or target the wrong page. + const pageResult = await this.db.query( + 'SELECT id FROM pages WHERE slug = $1 AND source_id = $2', + [slug, sourceId] + ); + if (pageResult.rows.length === 0) throw new Error(`Page not found: ${slug} (source=${sourceId})`); const pageId = (pageResult.rows[0] as { id: number }).id; // Remove chunks that no longer exist @@ -1025,13 +1039,14 @@ export class PGLiteEngine implements BrainEngine { ); } - async getChunks(slug: string): Promise { + async getChunks(slug: string, opts?: { sourceId?: string }): Promise { + const sourceId = opts?.sourceId ?? 'default'; const { rows } = await this.db.query( `SELECT cc.* FROM content_chunks cc JOIN pages p ON p.id = cc.page_id - WHERE p.slug = $1 + WHERE p.slug = $1 AND p.source_id = $2 ORDER BY cc.chunk_index`, - [slug] + [slug, sourceId] ); return (rows as Record[]).map(r => rowToChunk(r)); } @@ -1059,11 +1074,13 @@ export class PGLiteEngine implements BrainEngine { return rows as unknown as StaleChunkRow[]; } - async deleteChunks(slug: string): Promise { + async deleteChunks(slug: string, opts?: { sourceId?: string }): Promise { + const sourceId = opts?.sourceId ?? 
'default'; + // Source-qualify the page-id subquery; slugs are only unique per source. await this.db.query( `DELETE FROM content_chunks - WHERE page_id = (SELECT id FROM pages WHERE slug = $1)`, - [slug] + WHERE page_id = (SELECT id FROM pages WHERE slug = $1 AND source_id = $2)`, + [slug, sourceId] ); } @@ -1076,19 +1093,38 @@ export class PGLiteEngine implements BrainEngine { linkSource?: string, originSlug?: string, originField?: string, + opts?: { fromSourceId?: string; toSourceId?: string; originSourceId?: string }, ): Promise { + const fromSrc = opts?.fromSourceId ?? 'default'; + const toSrc = opts?.toSourceId ?? 'default'; + const originSrc = opts?.originSourceId ?? 'default'; + + // Source-qualified pre-check gives a clean missing-page error before the + // INSERT SELECT path can silently return zero rows. + const exists = await this.db.query( + `SELECT 1 FROM pages WHERE slug = $1 AND source_id = $2 + INTERSECT + SELECT 1 FROM pages WHERE slug = $3 AND source_id = $4`, + [from, fromSrc, to, toSrc] + ); + if (exists.rows.length === 0) { + throw new Error(`addLink failed: page "${from}" (source=${fromSrc}) or "${to}" (source=${toSrc}) not found`); + } const src = linkSource ?? 'markdown'; + // Mirror addLinksBatch's VALUES + composite JOIN shape. The old cross- + // product over pages f/t fanned out across sources containing the slugs. 
await this.db.query( `INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field) - SELECT f.id, t.id, $3, $4, $5, - (SELECT id FROM pages WHERE slug = $6), - $7 - FROM pages f, pages t - WHERE f.slug = $1 AND t.slug = $2 + SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field + FROM (VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)) + AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id) + JOIN pages f ON f.slug = v.from_slug AND f.source_id = v.from_source_id + JOIN pages t ON t.slug = v.to_slug AND t.source_id = v.to_source_id + LEFT JOIN pages o ON o.slug = v.origin_slug AND o.source_id = v.origin_source_id ON CONFLICT (from_page_id, to_page_id, link_type, link_source, origin_page_id) DO UPDATE SET context = EXCLUDED.context, origin_field = EXCLUDED.origin_field`, - [from, to, linkType || '', context || '', src, originSlug ?? null, originField ?? null] + [from, to, linkType || '', context || '', src, originSlug ?? null, originField ?? null, fromSrc, toSrc, originSrc] ); } @@ -1127,38 +1163,48 @@ export class PGLiteEngine implements BrainEngine { return result.rows.length; } - async removeLink(from: string, to: string, linkType?: string, linkSource?: string): Promise { + async removeLink( + from: string, + to: string, + linkType?: string, + linkSource?: string, + opts?: { fromSourceId?: string; toSourceId?: string }, + ): Promise { + const fromSrc = opts?.fromSourceId ?? 'default'; + const toSrc = opts?.toSourceId ?? 'default'; + // Each branch source-qualifies page-id subqueries so a delete only targets + // the intended edge between per-source slug rows. 
if (linkType !== undefined && linkSource !== undefined) { await this.db.query( `DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1) - AND to_page_id = (SELECT id FROM pages WHERE slug = $2) - AND link_type = $3 - AND link_source IS NOT DISTINCT FROM $4`, - [from, to, linkType, linkSource] + WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1 AND source_id = $2) + AND to_page_id = (SELECT id FROM pages WHERE slug = $3 AND source_id = $4) + AND link_type = $5 + AND link_source IS NOT DISTINCT FROM $6`, + [from, fromSrc, to, toSrc, linkType, linkSource] ); } else if (linkType !== undefined) { await this.db.query( `DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1) - AND to_page_id = (SELECT id FROM pages WHERE slug = $2) - AND link_type = $3`, - [from, to, linkType] + WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1 AND source_id = $2) + AND to_page_id = (SELECT id FROM pages WHERE slug = $3 AND source_id = $4) + AND link_type = $5`, + [from, fromSrc, to, toSrc, linkType] ); } else if (linkSource !== undefined) { await this.db.query( `DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1) - AND to_page_id = (SELECT id FROM pages WHERE slug = $2) - AND link_source IS NOT DISTINCT FROM $3`, - [from, to, linkSource] + WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1 AND source_id = $2) + AND to_page_id = (SELECT id FROM pages WHERE slug = $3 AND source_id = $4) + AND link_source IS NOT DISTINCT FROM $5`, + [from, fromSrc, to, toSrc, linkSource] ); } else { await this.db.query( `DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1) - AND to_page_id = (SELECT id FROM pages WHERE slug = $2)`, - [from, to] + WHERE from_page_id = (SELECT id FROM pages WHERE slug = $1 AND source_id = $2) + AND to_page_id = (SELECT id FROM pages WHERE slug = $3 AND source_id = $4)`, + [from, fromSrc, to, toSrc] ); } } @@ -1456,30 +1502,42 @@ export class 
PGLiteEngine implements BrainEngine { } // Tags - async addTag(slug: string, tag: string): Promise { + async addTag(slug: string, tag: string, opts?: { sourceId?: string }): Promise { + const sourceId = opts?.sourceId ?? 'default'; + // Pre-check source-scoped page existence; ON CONFLICT only handles the + // already-tagged case, not missing pages. + const page = await this.db.query( + 'SELECT id FROM pages WHERE slug = $1 AND source_id = $2', + [slug, sourceId] + ); + if (page.rows.length === 0) throw new Error(`addTag failed: page "${slug}" (source=${sourceId}) not found`); await this.db.query( `INSERT INTO tags (page_id, tag) - SELECT id, $2 FROM pages WHERE slug = $1 + VALUES ($1, $2) ON CONFLICT (page_id, tag) DO NOTHING`, - [slug, tag] + [(page.rows[0] as { id: number }).id, tag] ); } - async removeTag(slug: string, tag: string): Promise { + async removeTag(slug: string, tag: string, opts?: { sourceId?: string }): Promise { + const sourceId = opts?.sourceId ?? 'default'; + // Source-qualify the page-id subquery; slugs are only unique per source. await this.db.query( `DELETE FROM tags - WHERE page_id = (SELECT id FROM pages WHERE slug = $1) - AND tag = $2`, - [slug, tag] + WHERE page_id = (SELECT id FROM pages WHERE slug = $1 AND source_id = $2) + AND tag = $3`, + [slug, sourceId, tag] ); } - async getTags(slug: string): Promise { + async getTags(slug: string, opts?: { sourceId?: string }): Promise { + const sourceId = opts?.sourceId ?? 'default'; + // Source-qualify the page-id subquery; slugs are only unique per source. 
const { rows } = await this.db.query( `SELECT tag FROM tags - WHERE page_id = (SELECT id FROM pages WHERE slug = $1) + WHERE page_id = (SELECT id FROM pages WHERE slug = $1 AND source_id = $2) ORDER BY tag`, - [slug] + [slug, sourceId] ); return (rows as { tag: string }[]).map(r => r.tag); } @@ -1488,22 +1546,27 @@ export class PGLiteEngine implements BrainEngine { async addTimelineEntry( slug: string, entry: TimelineInput, - opts?: { skipExistenceCheck?: boolean }, + opts?: { skipExistenceCheck?: boolean; sourceId?: string }, ): Promise<void> { + const sourceId = opts?.sourceId ?? 'default'; if (!opts?.skipExistenceCheck) { - const { rows } = await this.db.query('SELECT 1 FROM pages WHERE slug = $1', [slug]); + const { rows } = await this.db.query( + 'SELECT 1 FROM pages WHERE slug = $1 AND source_id = $2', + [slug, sourceId] + ); if (rows.length === 0) { - throw new Error(`Page not found: ${slug}`); + throw new Error(`addTimelineEntry failed: page "${slug}" (source=${sourceId}) not found`); } } // ON CONFLICT DO NOTHING via the (page_id, date, summary) unique index. - // If insert is a no-op (duplicate), no row is returned; that's intentional. + // Source-qualify the page-id lookup so multi-source brains don't fan + // timeline rows out across every source containing the slug. await this.db.query( `INSERT INTO timeline_entries (page_id, date, source, summary, detail) SELECT id, $2::date, $3, $4, $5 - FROM pages WHERE slug = $1 + FROM pages WHERE slug = $1 AND source_id = $6 ON CONFLICT (page_id, date, summary) DO NOTHING`, - [slug, entry.date, entry.source || '', entry.summary, entry.detail || ''] + [slug, entry.date, entry.source || '', entry.summary, entry.detail || '', sourceId] ); } @@ -2063,14 +2126,16 @@ export class PGLiteEngine implements BrainEngine { } // Versions - async createVersion(slug: string): Promise<PageVersion> { + async createVersion(slug: string, opts?: { sourceId?: string }): Promise<PageVersion> { + const sourceId = opts?.sourceId ??
'default'; const { rows } = await this.db.query( `INSERT INTO page_versions (page_id, compiled_truth, frontmatter) SELECT id, compiled_truth, frontmatter - FROM pages WHERE slug = $1 + FROM pages WHERE slug = $1 AND source_id = $2 RETURNING *`, - [slug] + [slug, sourceId] ); + if (rows.length === 0) throw new Error(`createVersion failed: page "${slug}" (source=${sourceId}) not found`); return rows[0] as unknown as PageVersion; } @@ -2238,11 +2303,14 @@ export class PGLiteEngine implements BrainEngine { } // Sync - async updateSlug(oldSlug: string, newSlug: string): Promise { + async updateSlug(oldSlug: string, newSlug: string, opts?: { sourceId?: string }): Promise { newSlug = validateSlug(newSlug); + const sourceId = opts?.sourceId ?? 'default'; + // Source-qualify so a rename in source A doesn't sweep up same-slug rows + // in sources B/C/D (mirrors postgres-engine.ts). await this.db.query( - `UPDATE pages SET slug = $1, updated_at = now() WHERE slug = $2`, - [newSlug, oldSlug] + `UPDATE pages SET slug = $1, updated_at = now() WHERE slug = $2 AND source_id = $3`, + [newSlug, oldSlug, sourceId] ); } diff --git a/src/core/postgres-engine.ts b/src/core/postgres-engine.ts index 225b7ed9d..6f4b70216 100644 --- a/src/core/postgres-engine.ts +++ b/src/core/postgres-engine.ts @@ -543,15 +543,20 @@ export class PostgresEngine implements BrainEngine { return rowToPage(rows[0]); } - async putPage(slug: string, page: PageInput): Promise { + async putPage(slug: string, page: PageInput, opts?: { sourceId?: string }): Promise { slug = validateSlug(slug); const sql = this.sql; const hash = page.content_hash || contentHash(page); const frontmatter = page.frontmatter || {}; - - // v0.18.0 Step 2: source_id relies on schema DEFAULT 'default'. ON - // CONFLICT target becomes (source_id, slug) since global UNIQUE(slug) - // was dropped in migration v17. + const sourceId = opts?.sourceId ?? 
'default'; + + // v0.18.0 Step 5+: source_id is now in the INSERT column list so multi- + // source callers actually land on the (source_id, slug) row they intend. + // Pre-fix: omitting source_id let the schema DEFAULT 'default' apply, so + // a caller syncing under 'jarvis-memory' silently fabricated a duplicate + // at (default, slug); subsequent bare-slug subqueries (getTags, deleteChunks, + // etc.) then matched 2 rows and blew up with Postgres 21000. + // ON CONFLICT target is (source_id, slug); global UNIQUE(slug) dropped in v17. const pageKind = page.page_kind || 'markdown'; // v0.29.1 — effective_date / effective_date_source / import_filename are // additive opt-in inputs from the importer (computeEffectiveDate). When @@ -562,8 +567,8 @@ export class PostgresEngine implements BrainEngine { const effectiveDateSource = page.effective_date_source ?? null; const importFilename = page.import_filename ?? null; const rows = await sql` - INSERT INTO pages (slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at, effective_date, effective_date_source, import_filename) - VALUES (${slug}, ${page.type}, ${pageKind}, ${page.title}, ${page.compiled_truth}, ${page.timeline || ''}, ${sql.json(frontmatter as Parameters[0])}, ${hash}, now(), ${effectiveDate}, ${effectiveDateSource}, ${importFilename}) + INSERT INTO pages (source_id, slug, type, page_kind, title, compiled_truth, timeline, frontmatter, content_hash, updated_at, effective_date, effective_date_source, import_filename) + VALUES (${sourceId}, ${slug}, ${page.type}, ${pageKind}, ${page.title}, ${page.compiled_truth}, ${page.timeline || ''}, ${sql.json(frontmatter as Parameters[0])}, ${hash}, now(), ${effectiveDate}, ${effectiveDateSource}, ${importFilename}) ON CONFLICT (source_id, slug) DO UPDATE SET type = EXCLUDED.type, page_kind = EXCLUDED.page_kind, @@ -581,9 +586,10 @@ export class PostgresEngine implements BrainEngine { return rowToPage(rows[0]); } - async 
deletePage(slug: string): Promise { + async deletePage(slug: string, opts?: { sourceId?: string }): Promise { const sql = this.sql; - await sql`DELETE FROM pages WHERE slug = ${slug}`; + const sourceId = opts?.sourceId ?? 'default'; + await sql`DELETE FROM pages WHERE slug = ${slug} AND source_id = ${sourceId}`; } async softDeletePage(slug: string, opts?: { sourceId?: string }): Promise<{ slug: string } | null> { @@ -1062,12 +1068,15 @@ export class PostgresEngine implements BrainEngine { } // Chunks - async upsertChunks(slug: string, chunks: ChunkInput[]): Promise { + async upsertChunks(slug: string, chunks: ChunkInput[], opts?: { sourceId?: string }): Promise { const sql = this.sql; + const sourceId = opts?.sourceId ?? 'default'; - // Get page_id - const pages = await sql`SELECT id FROM pages WHERE slug = ${slug}`; - if (pages.length === 0) throw new Error(`Page not found: ${slug}`); + // Source-scope the page-id lookup. Without this filter, multi-source + // brains where the slug exists in 2+ sources return >1 row and the + // chunk replacement targets the wrong page (or fans out across pages). + const pages = await sql`SELECT id FROM pages WHERE slug = ${slug} AND source_id = ${sourceId}`; + if (pages.length === 0) throw new Error(`Page not found: ${slug} (source=${sourceId})`); const pageId = pages[0].id; // Remove chunks that no longer exist (chunk_index beyond new count) @@ -1163,12 +1172,13 @@ export class PostgresEngine implements BrainEngine { ); } - async getChunks(slug: string): Promise { + async getChunks(slug: string, opts?: { sourceId?: string }): Promise { const sql = this.sql; + const sourceId = opts?.sourceId ?? 
'default'; const rows = await sql` SELECT cc.* FROM content_chunks cc JOIN pages p ON p.id = cc.page_id - WHERE p.slug = ${slug} + WHERE p.slug = ${slug} AND p.source_id = ${sourceId} ORDER BY cc.chunk_index `; return rows.map((r) => rowToChunk(r as Record)); @@ -1198,11 +1208,12 @@ export class PostgresEngine implements BrainEngine { return rows as unknown as StaleChunkRow[]; } - async deleteChunks(slug: string): Promise { + async deleteChunks(slug: string, opts?: { sourceId?: string }): Promise { const sql = this.sql; + const sourceId = opts?.sourceId ?? 'default'; await sql` DELETE FROM content_chunks - WHERE page_id = (SELECT id FROM pages WHERE slug = ${slug}) + WHERE page_id = (SELECT id FROM pages WHERE slug = ${slug} AND source_id = ${sourceId}) `; } @@ -1215,28 +1226,39 @@ export class PostgresEngine implements BrainEngine { linkSource?: string, originSlug?: string, originField?: string, + opts?: { fromSourceId?: string; toSourceId?: string; originSourceId?: string }, ): Promise { const sql = this.sql; + const fromSrc = opts?.fromSourceId ?? 'default'; + const toSrc = opts?.toSourceId ?? 'default'; + const originSrc = opts?.originSourceId ?? 'default'; + // Pre-check existence so we can throw a clear error (ON CONFLICT DO UPDATE - // returns 0 rows when source SELECT is empty, indistinguishable from missing page). + // returns 0 rows when source SELECT is empty, indistinguishable from missing + // page). Source-qualified — pre-v0.18 the bare slug check matched ANY source, + // letting addLink succeed even when the intended source row was missing. 
const exists = await sql` - SELECT 1 FROM pages WHERE slug = ${from} + SELECT 1 FROM pages WHERE slug = ${from} AND source_id = ${fromSrc} INTERSECT - SELECT 1 FROM pages WHERE slug = ${to} + SELECT 1 FROM pages WHERE slug = ${to} AND source_id = ${toSrc} `; if (exists.length === 0) { - throw new Error(`addLink failed: page "${from}" or "${to}" not found`); + throw new Error(`addLink failed: page "${from}" (source=${fromSrc}) or "${to}" (source=${toSrc}) not found`); } // Default link_source to 'markdown' for back-compat with pre-v0.13 callers. - // origin_page_id resolves from originSlug via the pages join (NULL if no slug). + // Mirror addLinksBatch's VALUES + JOIN-on-(slug, source_id) shape. The old + // `FROM pages f, pages t` cross-product fanned out across every source + // containing either slug, so a multi-source brain silently created edges + // pointing at the wrong pages. const src = linkSource ?? 'markdown'; await sql` INSERT INTO links (from_page_id, to_page_id, link_type, context, link_source, origin_page_id, origin_field) - SELECT f.id, t.id, ${linkType || ''}, ${context || ''}, ${src}, - (SELECT id FROM pages WHERE slug = ${originSlug ?? null}), - ${originField ?? null} - FROM pages f, pages t - WHERE f.slug = ${from} AND t.slug = ${to} + SELECT f.id, t.id, v.link_type, v.context, v.link_source, o.id, v.origin_field + FROM (VALUES (${from}, ${to}, ${linkType || ''}, ${context || ''}, ${src}, ${originSlug ?? null}, ${originField ?? 
null}, ${fromSrc}, ${toSrc}, ${originSrc})) + AS v(from_slug, to_slug, link_type, context, link_source, origin_slug, origin_field, from_source_id, to_source_id, origin_source_id) + JOIN pages f ON f.slug = v.from_slug AND f.source_id = v.from_source_id + JOIN pages t ON t.slug = v.to_slug AND t.source_id = v.to_source_id + LEFT JOIN pages o ON o.slug = v.origin_slug AND o.source_id = v.origin_source_id ON CONFLICT (from_page_id, to_page_id, link_type, link_source, origin_page_id) DO UPDATE SET context = EXCLUDED.context, origin_field = EXCLUDED.origin_field @@ -1282,37 +1304,47 @@ export class PostgresEngine implements BrainEngine { return result.length; } - async removeLink(from: string, to: string, linkType?: string, linkSource?: string): Promise<void> { + async removeLink( + from: string, + to: string, + linkType?: string, + linkSource?: string, + opts?: { fromSourceId?: string; toSourceId?: string }, + ): Promise<void> { const sql = this.sql; + const fromSrc = opts?.fromSourceId ?? 'default'; + const toSrc = opts?.toSourceId ?? 'default'; // Build up filters dynamically. linkType + linkSource are independent - // optional constraints; all four combinations are valid. + // optional constraints; all four combinations are valid. Each branch's + // page-id subquery is source-qualified so multi-source brains don't + // delete the wrong (from, to) pair.
if (linkType !== undefined && linkSource !== undefined) { await sql` DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from}) - AND to_page_id = (SELECT id FROM pages WHERE slug = ${to}) + WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from} AND source_id = ${fromSrc}) + AND to_page_id = (SELECT id FROM pages WHERE slug = ${to} AND source_id = ${toSrc}) AND link_type = ${linkType} AND link_source IS NOT DISTINCT FROM ${linkSource} `; } else if (linkType !== undefined) { await sql` DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from}) - AND to_page_id = (SELECT id FROM pages WHERE slug = ${to}) + WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from} AND source_id = ${fromSrc}) + AND to_page_id = (SELECT id FROM pages WHERE slug = ${to} AND source_id = ${toSrc}) AND link_type = ${linkType} `; } else if (linkSource !== undefined) { await sql` DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from}) - AND to_page_id = (SELECT id FROM pages WHERE slug = ${to}) + WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from} AND source_id = ${fromSrc}) + AND to_page_id = (SELECT id FROM pages WHERE slug = ${to} AND source_id = ${toSrc}) AND link_source IS NOT DISTINCT FROM ${linkSource} `; } else { await sql` DELETE FROM links - WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from}) - AND to_page_id = (SELECT id FROM pages WHERE slug = ${to}) + WHERE from_page_id = (SELECT id FROM pages WHERE slug = ${from} AND source_id = ${fromSrc}) + AND to_page_id = (SELECT id FROM pages WHERE slug = ${to} AND source_id = ${toSrc}) `; } } @@ -1619,12 +1651,15 @@ export class PostgresEngine implements BrainEngine { } // Tags - async addTag(slug: string, tag: string): Promise<void> { + async addTag(slug: string, tag: string, opts?: { sourceId?: string }): Promise<void> { const sql = this.sql; + const sourceId = opts?.sourceId ??
'default'; // Verify page exists before attempting insert (ON CONFLICT DO NOTHING - // swallows the "already tagged" case, but we still need to detect missing pages) - const page = await sql`SELECT id FROM pages WHERE slug = ${slug}`; - if (page.length === 0) throw new Error(`addTag failed: page "${slug}" not found`); + // swallows the "already tagged" case, but we still need to detect missing + // pages). Source-scoped lookup — pre-v0.18 the bare-slug subquery returned + // multiple rows in multi-source brains and crashed with Postgres 21000. + const page = await sql`SELECT id FROM pages WHERE slug = ${slug} AND source_id = ${sourceId}`; + if (page.length === 0) throw new Error(`addTag failed: page "${slug}" (source=${sourceId}) not found`); await sql` INSERT INTO tags (page_id, tag) VALUES (${page[0].id}, ${tag}) @@ -1632,20 +1667,22 @@ export class PostgresEngine implements BrainEngine { `; } - async removeTag(slug: string, tag: string): Promise<void> { + async removeTag(slug: string, tag: string, opts?: { sourceId?: string }): Promise<void> { const sql = this.sql; + const sourceId = opts?.sourceId ?? 'default'; await sql` DELETE FROM tags - WHERE page_id = (SELECT id FROM pages WHERE slug = ${slug}) + WHERE page_id = (SELECT id FROM pages WHERE slug = ${slug} AND source_id = ${sourceId}) AND tag = ${tag} `; } - async getTags(slug: string): Promise<string[]> { + async getTags(slug: string, opts?: { sourceId?: string }): Promise<string[]> { const sql = this.sql; + const sourceId = opts?.sourceId ??
'default'; const rows = await sql` SELECT tag FROM tags - WHERE page_id = (SELECT id FROM pages WHERE slug = ${slug}) + WHERE page_id = (SELECT id FROM pages WHERE slug = ${slug} AND source_id = ${sourceId}) ORDER BY tag `; return rows.map((r) => r.tag as string); @@ -1655,22 +1692,25 @@ export class PostgresEngine implements BrainEngine { async addTimelineEntry( slug: string, entry: TimelineInput, - opts?: { skipExistenceCheck?: boolean }, + opts?: { skipExistenceCheck?: boolean; sourceId?: string }, ): Promise<void> { const sql = this.sql; + const sourceId = opts?.sourceId ?? 'default'; if (!opts?.skipExistenceCheck) { - const exists = await sql`SELECT 1 FROM pages WHERE slug = ${slug}`; + const exists = await sql`SELECT 1 FROM pages WHERE slug = ${slug} AND source_id = ${sourceId}`; if (exists.length === 0) { - throw new Error(`addTimelineEntry failed: page "${slug}" not found`); + throw new Error(`addTimelineEntry failed: page "${slug}" (source=${sourceId}) not found`); } } // ON CONFLICT DO NOTHING via the (page_id, date, summary) unique index. // Returning 0 rows means either page missing OR duplicate; skipExistenceCheck - // makes that ambiguity safe (caller asserts page exists). + // makes that ambiguity safe (caller asserts page exists). Source-qualify + // the page-id lookup so multi-source brains don't fan timeline rows out + // across every source containing the slug. await sql` INSERT INTO timeline_entries (page_id, date, source, summary, detail) SELECT id, ${entry.date}::date, ${entry.source || ''}, ${entry.summary}, ${entry.detail || ''} - FROM pages WHERE slug = ${slug} + FROM pages WHERE slug = ${slug} AND source_id = ${sourceId} ON CONFLICT (page_id, date, summary) DO NOTHING `; } @@ -2192,15 +2232,16 @@ export class PostgresEngine implements BrainEngine { } // Versions - async createVersion(slug: string): Promise<PageVersion> { + async createVersion(slug: string, opts?: { sourceId?: string }): Promise<PageVersion> { const sql = this.sql; + const sourceId = opts?.sourceId ??
'default'; const rows = await sql` INSERT INTO page_versions (page_id, compiled_truth, frontmatter) SELECT id, compiled_truth, frontmatter - FROM pages WHERE slug = ${slug} + FROM pages WHERE slug = ${slug} AND source_id = ${sourceId} RETURNING * `; - if (rows.length === 0) throw new Error(`createVersion failed: page "${slug}" not found`); + if (rows.length === 0) throw new Error(`createVersion failed: page "${slug}" (source=${sourceId}) not found`); return rows[0] as unknown as PageVersion; } @@ -2370,10 +2411,14 @@ export class PostgresEngine implements BrainEngine { } // Sync - async updateSlug(oldSlug: string, newSlug: string): Promise { + async updateSlug(oldSlug: string, newSlug: string, opts?: { sourceId?: string }): Promise { newSlug = validateSlug(newSlug); const sql = this.sql; - await sql`UPDATE pages SET slug = ${newSlug}, updated_at = now() WHERE slug = ${oldSlug}`; + const sourceId = opts?.sourceId ?? 'default'; + // Source-qualify so a rename in source A doesn't sweep up same-slug rows + // in sources B/C/D (which would either rename them all OR fail the + // (source_id, slug) UNIQUE if the new slug already exists in another source). + await sql`UPDATE pages SET slug = ${newSlug}, updated_at = now() WHERE slug = ${oldSlug} AND source_id = ${sourceId}`; } async rewriteLinks(_oldSlug: string, _newSlug: string): Promise { diff --git a/test/source-id-tx-regression.test.ts b/test/source-id-tx-regression.test.ts new file mode 100644 index 000000000..b267672dc --- /dev/null +++ b/test/source-id-tx-regression.test.ts @@ -0,0 +1,466 @@ +/** + * v0.18.0+ Step 5+ regression — source_id threading through the per-page + * transaction surface (putPage / createVersion / getTags / addTag / removeTag / + * deleteChunks / upsertChunks / addLink / removeLink). + * + * Pre-fix bug: + * - putPage omitted source_id from its INSERT column list, so the schema + * DEFAULT 'default' was applied even when the caller meant to write under + * a non-default source (e.g. 
'jarvis-memory'). When the same slug already + * existed under the intended source, putPage silently fabricated a + * duplicate row at (default, slug). Both rows then coexisted under the + * composite UNIQUE. + * - Subsequent bare-slug subqueries inside the same transaction — + * `(SELECT id FROM pages WHERE slug = $1)` in getTags / removeTag / + * deleteChunks / removeLink — returned 2 rows and crashed with Postgres + * 21000 ("more than one row returned by a subquery used as an expression"), + * rolling back the entire tx. + * + * Fix: + * - putPage adds source_id to the INSERT column list (defaults to 'default' + * when opts.sourceId is omitted, preserving back-compat). + * - Every bare-slug page-id subquery becomes source-qualified + * (`AND source_id = $X`), eliminating the multi-row fan-out. + * - addLink converts away from `FROM pages f, pages t` cross-product and + * mirrors addLinksBatch's VALUES + JOIN-on-(slug, source_id) shape. + * + * Backwards-compat: every method's opts param is optional. Existing callers + * that don't pass sourceId continue to target source 'default' (the schema + * default) and behave identically to pre-fix. + */ + +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { PGLiteEngine } from '../src/core/pglite-engine.ts'; +import { runSources } from '../src/commands/sources.ts'; +import { importFromContent } from '../src/core/import-file.ts'; + +let engine: PGLiteEngine; + +beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({ type: 'pglite' } as never); + await engine.initSchema(); + // Add the second source up-front; tests below assume both 'default' and + // 'testsrc' exist. 
+ await runSources(engine, ['add', 'testsrc', '--no-federated']); +}, 60_000); + +afterAll(async () => { + if (engine) await engine.disconnect(); +}, 60_000); + +const SLUG = 'topics/source-id-regression'; + +describe('putPage threads source_id into the INSERT column list', () => { + test('putPage with opts.sourceId writes under the intended source', async () => { + await engine.putPage(SLUG, { + type: 'concept', + title: 'Default-source variant', + compiled_truth: 'Lives under source=default.', + }); + await engine.putPage(SLUG, { + type: 'concept', + title: 'Testsrc-source variant', + compiled_truth: 'Lives under source=testsrc.', + }, { sourceId: 'testsrc' }); + + const rows = await engine.executeRaw<{ source_id: string; title: string }>( + `SELECT source_id, title FROM pages WHERE slug = $1 ORDER BY source_id`, + [SLUG], + ); + expect(rows.length).toBe(2); + expect(rows[0].source_id).toBe('default'); + expect(rows[0].title).toBe('Default-source variant'); + expect(rows[1].source_id).toBe('testsrc'); + expect(rows[1].title).toBe('Testsrc-source variant'); + }); + + test('putPage without opts.sourceId still targets source=default (back-compat)', async () => { + // Call again under default to verify the no-opts path still hits the same + // (default, slug) row rather than fabricating a duplicate. + const updated = await engine.putPage(SLUG, { + type: 'concept', + title: 'Default-source updated', + compiled_truth: 'Updated content.', + }); + expect(updated.title).toBe('Default-source updated'); + + const rows = await engine.executeRaw<{ source_id: string; title: string }>( + `SELECT source_id, title FROM pages WHERE slug = $1 ORDER BY source_id`, + [SLUG], + ); + // Still exactly two rows — no duplicate fabricated. 
+ expect(rows.length).toBe(2); + expect(rows.find(r => r.source_id === 'default')!.title).toBe('Default-source updated'); + expect(rows.find(r => r.source_id === 'testsrc')!.title).toBe('Testsrc-source variant'); + }); +}); + +describe('Per-page tx methods source-qualify their bare-slug subqueries', () => { + test('getTags(slug, { sourceId }) returns scoped tags without 21000', async () => { + // Pre-fix: this call would crash because the bare-slug subquery + // `(SELECT id FROM pages WHERE slug = $1)` matched both rows. + await engine.addTag(SLUG, 'shared-by-default', { sourceId: 'default' }); + await engine.addTag(SLUG, 'unique-to-testsrc', { sourceId: 'testsrc' }); + await engine.addTag(SLUG, 'also-shared', { sourceId: 'default' }); + await engine.addTag(SLUG, 'also-shared', { sourceId: 'testsrc' }); + + const defaultTags = await engine.getTags(SLUG, { sourceId: 'default' }); + expect(defaultTags.sort()).toEqual(['also-shared', 'shared-by-default']); + + const testsrcTags = await engine.getTags(SLUG, { sourceId: 'testsrc' }); + expect(testsrcTags.sort()).toEqual(['also-shared', 'unique-to-testsrc']); + }); + + test('removeTag(slug, tag, { sourceId }) only removes from one source', async () => { + await engine.removeTag(SLUG, 'also-shared', { sourceId: 'testsrc' }); + expect((await engine.getTags(SLUG, { sourceId: 'default' })).sort()) + .toEqual(['also-shared', 'shared-by-default']); + expect((await engine.getTags(SLUG, { sourceId: 'testsrc' })).sort()) + .toEqual(['unique-to-testsrc']); + }); + + test('deleteChunks(slug, { sourceId }) only deletes one source\'s chunks', async () => { + await engine.upsertChunks(SLUG, [ + { chunk_index: 0, chunk_text: 'default chunk 0', chunk_source: 'compiled_truth' }, + ], { sourceId: 'default' }); + await engine.upsertChunks(SLUG, [ + { chunk_index: 0, chunk_text: 'testsrc chunk 0', chunk_source: 'compiled_truth' }, + ], { sourceId: 'testsrc' }); + + const beforeRows = await engine.executeRaw<{ source_id: string; chunk_text: 
string }>( + `SELECT p.source_id, cc.chunk_text + FROM content_chunks cc + JOIN pages p ON p.id = cc.page_id + WHERE p.slug = $1 + ORDER BY p.source_id`, + [SLUG], + ); + expect(beforeRows.length).toBe(2); + + await engine.deleteChunks(SLUG, { sourceId: 'testsrc' }); + + const afterRows = await engine.executeRaw<{ source_id: string; chunk_text: string }>( + `SELECT p.source_id, cc.chunk_text + FROM content_chunks cc + JOIN pages p ON p.id = cc.page_id + WHERE p.slug = $1`, + [SLUG], + ); + expect(afterRows.length).toBe(1); + expect(afterRows[0].source_id).toBe('default'); + }); + + test('createVersion(slug, { sourceId }) snapshots the right row', async () => { + const v = await engine.createVersion(SLUG, { sourceId: 'testsrc' }); + expect(v).toBeDefined(); + const rows = await engine.executeRaw<{ source_id: string; compiled_truth: string }>( + `SELECT p.source_id, pv.compiled_truth + FROM page_versions pv + JOIN pages p ON p.id = pv.page_id + WHERE p.slug = $1 + ORDER BY pv.snapshot_at DESC + LIMIT 1`, + [SLUG], + ); + expect(rows.length).toBe(1); + expect(rows[0].source_id).toBe('testsrc'); + expect(rows[0].compiled_truth).toBe('Lives under source=testsrc.'); + }); +}); + +describe('addLink rewrites the cross-product into a source-qualified JOIN', () => { + const FROM_SLUG = 'topics/regression-link-from'; + const TO_SLUG = 'topics/regression-link-to'; + + test('addLink with opts.{from,to,origin}SourceId targets the right rows', async () => { + // Set up: same (from, to) slug pair under both default and testsrc. 
+ await engine.putPage(FROM_SLUG, { type: 'concept', title: 'F default', compiled_truth: '' }); + await engine.putPage(TO_SLUG, { type: 'concept', title: 'T default', compiled_truth: '' }); + await engine.putPage(FROM_SLUG, { type: 'concept', title: 'F testsrc', compiled_truth: '' }, { sourceId: 'testsrc' }); + await engine.putPage(TO_SLUG, { type: 'concept', title: 'T testsrc', compiled_truth: '' }, { sourceId: 'testsrc' }); + + // Add an edge under testsrc only. + await engine.addLink( + FROM_SLUG, TO_SLUG, 'testsrc edge', 'documents', 'markdown', undefined, undefined, + { fromSourceId: 'testsrc', toSourceId: 'testsrc', originSourceId: 'testsrc' }, + ); + + // Verify the link's endpoints both point at the testsrc rows, not the + // default rows. Pre-fix, the cross-product `FROM pages f, pages t` would + // pick whichever order Postgres returned; the source filter eliminates + // that fan-out. + const rows = await engine.executeRaw<{ from_src: string; to_src: string; context: string }>( + `SELECT f.source_id AS from_src, t.source_id AS to_src, l.context + FROM links l + JOIN pages f ON f.id = l.from_page_id + JOIN pages t ON t.id = l.to_page_id + WHERE l.context = 'testsrc edge'`, + ); + expect(rows.length).toBe(1); + expect(rows[0].from_src).toBe('testsrc'); + expect(rows[0].to_src).toBe('testsrc'); + }); + + test('addLink with no opts defaults to source=default (back-compat)', async () => { + await engine.addLink( + FROM_SLUG, TO_SLUG, 'default edge', 'documents', 'markdown', + ); + const rows = await engine.executeRaw<{ from_src: string; to_src: string }>( + `SELECT f.source_id AS from_src, t.source_id AS to_src + FROM links l + JOIN pages f ON f.id = l.from_page_id + JOIN pages t ON t.id = l.to_page_id + WHERE l.context = 'default edge'`, + ); + expect(rows.length).toBe(1); + expect(rows[0].from_src).toBe('default'); + expect(rows[0].to_src).toBe('default'); + }); + + test('addLink fails fast when the source-qualified endpoint doesn\'t exist', async () => { + 
// Pre-fix: cross-product would silently fall back to the wrong source + // pair and succeed. Post-fix: missing-source-row → no JOIN match → no row + // inserted → INTERSECT pre-check throws. + let err: Error | null = null; + try { + await engine.addLink( + FROM_SLUG, TO_SLUG, 'phantom edge', 'documents', 'markdown', undefined, undefined, + { fromSourceId: 'nonexistent-src', toSourceId: 'nonexistent-src' }, + ); + } catch (e) { + err = e as Error; + } + expect(err).not.toBeNull(); + expect(err!.message).toMatch(/not found/); + }); +}); + +describe('importFromContent threads sourceId through the entire transaction body', () => { + const IMP_SLUG = 'topics/regression-import-thread'; + + test('importFromContent under source=testsrc does not fabricate a (default, slug) duplicate', async () => { + // Pre-seed a default-source row at the same slug to prove the fix actually + // discriminates: pre-fix, importing under testsrc would have ALSO touched + // the default row (or duplicated it) and the bare-slug getTags inside the + // tx would crash with 21000. + await engine.putPage(IMP_SLUG, { + type: 'concept', + title: 'Default-source seed', + compiled_truth: 'pre-existing default row', + }); + + const md = `--- +type: concept +title: Imported under testsrc +--- + +# Imported under testsrc + +Body content; tags get reconciled inside the transaction. +`; + + // No 21000, no duplicate. Pre-fix this call would have either crashed + // mid-tx (rolling back) OR fabricated a third row at (default, slug). 
+ const result = await importFromContent(engine, IMP_SLUG, md, { + noEmbed: true, + sourceId: 'testsrc', + }); + expect(result.status).toBe('imported'); + + const rows = await engine.executeRaw<{ source_id: string; title: string }>( + `SELECT source_id, title FROM pages WHERE slug = $1 ORDER BY source_id`, + [IMP_SLUG], + ); + expect(rows.length).toBe(2); + expect(rows[0].source_id).toBe('default'); + expect(rows[0].title).toBe('Default-source seed'); + expect(rows[1].source_id).toBe('testsrc'); + expect(rows[1].title).toBe('Imported under testsrc'); + }); + + test('re-importing same content under same sourceId is idempotent (status=skipped)', async () => { + const md = `--- +type: concept +title: Imported under testsrc +--- + +# Imported under testsrc + +Body content; tags get reconciled inside the transaction. +`; + const result = await importFromContent(engine, IMP_SLUG, md, { + noEmbed: true, + sourceId: 'testsrc', + }); + expect(result.status).toBe('skipped'); + }); +}); + +describe('addTimelineEntry source-scoping (Data R1 HIGH 2 fix)', () => { + const TL_SLUG = 'topics/regression-timeline'; + + test('addTimelineEntry with opts.sourceId only writes to the intended source', async () => { + // Set up: same slug under both default and testsrc. + await engine.putPage(TL_SLUG, { type: 'concept', title: 'TL default', compiled_truth: '' }); + await engine.putPage(TL_SLUG, { type: 'concept', title: 'TL testsrc', compiled_truth: '' }, { sourceId: 'testsrc' }); + + // Pre-fix: bare-slug `INSERT ... SELECT id FROM pages WHERE slug = $1` + // would have inserted timeline rows for BOTH source rows, fanning out + // the entry across sources. 
+ await engine.addTimelineEntry(TL_SLUG, { + date: '2026-05-07', + source: 'test', + summary: 'testsrc-only entry', + detail: 'Should land only under testsrc.', + }, { sourceId: 'testsrc' }); + + const rows = await engine.executeRaw<{ source_id: string; summary: string }>( + `SELECT p.source_id, te.summary + FROM timeline_entries te + JOIN pages p ON p.id = te.page_id + WHERE p.slug = $1`, + [TL_SLUG], + ); + expect(rows.length).toBe(1); + expect(rows[0].source_id).toBe('testsrc'); + expect(rows[0].summary).toBe('testsrc-only entry'); + }); + + test('addTimelineEntry rejects missing source-qualified page', async () => { + let err: Error | null = null; + try { + await engine.addTimelineEntry(TL_SLUG, { + date: '2026-05-08', + source: 'test', + summary: 'bad source', + detail: '', + }, { sourceId: 'nonexistent-src' }); + } catch (e) { + err = e as Error; + } + expect(err).not.toBeNull(); + expect(err!.message).toMatch(/not found/); + }); + + test('addTimelineEntry without opts defaults to source=default (back-compat)', async () => { + await engine.addTimelineEntry(TL_SLUG, { + date: '2026-05-09', + source: 'test', + summary: 'default-source entry', + detail: '', + }); + + const rows = await engine.executeRaw<{ source_id: string; summary: string }>( + `SELECT p.source_id, te.summary + FROM timeline_entries te + JOIN pages p ON p.id = te.page_id + WHERE p.slug = $1 AND te.summary = 'default-source entry'`, + [TL_SLUG], + ); + expect(rows.length).toBe(1); + expect(rows[0].source_id).toBe('default'); + }); +}); + +describe('deletePage + updateSlug source-scoping (Data R2 CRITICAL + HIGH fix)', () => { + const DEL_SLUG = 'topics/regression-delete'; + const REN_FROM = 'topics/regression-rename-from'; + const REN_TO = 'topics/regression-rename-to'; + + test('deletePage with opts.sourceId only deletes the intended source row', async () => { + // Set up: same slug under both default and testsrc. 
+ await engine.putPage(DEL_SLUG, { type: 'concept', title: 'D default', compiled_truth: '' }); + await engine.putPage(DEL_SLUG, { type: 'concept', title: 'D testsrc', compiled_truth: '' }, { sourceId: 'testsrc' }); + + // Pre-fix: bare `DELETE FROM pages WHERE slug = $1` would have hard-deleted + // BOTH rows across sources. Post-fix: only the testsrc row goes. + await engine.deletePage(DEL_SLUG, { sourceId: 'testsrc' }); + + const rows = await engine.executeRaw<{ source_id: string }>( + `SELECT source_id FROM pages WHERE slug = $1`, + [DEL_SLUG], + ); + expect(rows.length).toBe(1); + expect(rows[0].source_id).toBe('default'); + }); + + test('deletePage without opts targets source=default only (back-compat)', async () => { + // Recreate the testsrc row to test that default-source delete leaves it. + await engine.putPage(DEL_SLUG, { type: 'concept', title: 'D testsrc back', compiled_truth: '' }, { sourceId: 'testsrc' }); + await engine.deletePage(DEL_SLUG); // no opts → defaults to 'default' + + const rows = await engine.executeRaw<{ source_id: string }>( + `SELECT source_id FROM pages WHERE slug = $1`, + [DEL_SLUG], + ); + expect(rows.length).toBe(1); + expect(rows[0].source_id).toBe('testsrc'); + }); + + test('updateSlug with opts.sourceId only renames the intended source row', async () => { + // Set up: same slug under both default and testsrc. + await engine.putPage(REN_FROM, { type: 'concept', title: 'R default', compiled_truth: '' }); + await engine.putPage(REN_FROM, { type: 'concept', title: 'R testsrc', compiled_truth: '' }, { sourceId: 'testsrc' }); + + // Pre-fix: bare `UPDATE pages SET slug = $new WHERE slug = $old` would have + // hit both rows; if REN_TO already existed in either source, the (source_id, + // slug) UNIQUE would fail. Post-fix: only the testsrc row gets renamed. 
+ await engine.updateSlug(REN_FROM, REN_TO, { sourceId: 'testsrc' }); + + const fromRows = await engine.executeRaw<{ source_id: string }>( + `SELECT source_id FROM pages WHERE slug = $1 ORDER BY source_id`, + [REN_FROM], + ); + expect(fromRows.length).toBe(1); + expect(fromRows[0].source_id).toBe('default'); + + const toRows = await engine.executeRaw<{ source_id: string }>( + `SELECT source_id FROM pages WHERE slug = $1`, + [REN_TO], + ); + expect(toRows.length).toBe(1); + expect(toRows[0].source_id).toBe('testsrc'); + }); + + test('getChunks with opts.sourceId only returns the intended source\'s chunks', async () => { + // Set up: same slug under both default and testsrc, each with distinct chunks. + const CHUNK_SLUG = 'topics/regression-getchunks'; + await engine.putPage(CHUNK_SLUG, { type: 'concept', title: 'C default', compiled_truth: '' }); + await engine.putPage(CHUNK_SLUG, { type: 'concept', title: 'C testsrc', compiled_truth: '' }, { sourceId: 'testsrc' }); + await engine.upsertChunks(CHUNK_SLUG, [ + { chunk_index: 0, chunk_text: 'default chunk text', chunk_source: 'compiled_truth' }, + ], { sourceId: 'default' }); + await engine.upsertChunks(CHUNK_SLUG, [ + { chunk_index: 0, chunk_text: 'testsrc chunk text', chunk_source: 'compiled_truth' }, + ], { sourceId: 'testsrc' }); + + // Pre-fix: bare-slug `WHERE p.slug = $1` returned BOTH source's chunks + // mashed together. importCodeFile uses getChunks for incremental embedding + // reuse; pre-fix would have grabbed the wrong source's embeddings. 
+ const defaultChunks = await engine.getChunks(CHUNK_SLUG, { sourceId: 'default' }); + expect(defaultChunks.length).toBe(1); + expect(defaultChunks[0].chunk_text).toBe('default chunk text'); + + const testsrcChunks = await engine.getChunks(CHUNK_SLUG, { sourceId: 'testsrc' }); + expect(testsrcChunks.length).toBe(1); + expect(testsrcChunks[0].chunk_text).toBe('testsrc chunk text'); + }); + + test('updateSlug without opts targets source=default only (back-compat)', async () => { + // Default still has REN_FROM. Rename it without opts; testsrc REN_TO + // already exists, so a bare rename would fail (source_id, slug) UNIQUE + // when both default and testsrc converge on REN_TO. Source-scoped rename + // succeeds because testsrc is untouched. + const REN_TO_2 = 'topics/regression-rename-to-2'; + await engine.updateSlug(REN_FROM, REN_TO_2); + + const rows = await engine.executeRaw<{ source_id: string; slug: string }>( + `SELECT source_id, slug FROM pages WHERE slug IN ($1, $2) ORDER BY source_id`, + [REN_FROM, REN_TO_2], + ); + expect(rows.length).toBe(1); + expect(rows[0].source_id).toBe('default'); + expect(rows[0].slug).toBe(REN_TO_2); + }); +}); From 4dc9dcfed39fbab06f548f40a68ec53f99c0f273 Mon Sep 17 00:00:00 2001 From: Jeremy Knows Date: Fri, 8 May 2026 17:36:01 -0400 Subject: [PATCH 18/41] fix(multi-source): plumb sourceId through performFullSync (PR #707 gap) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #707 fixed source_id routing for sync's incremental loop (lines 581/641) but performFullSync (line 922) calls runImport without threading sourceId. Result: full syncs route pages to default even with --source X. Verified on v0.30.1 by direct PGLite probe after `gbrain sync --source X --full`: all pages landed in default, not the named source. Fix: - runImport accepts sourceId in opts (programmatic only — no CLI flag, preserving PR #707's design intent of `gbrain import` being default-only). 
- runImport threads sourceId to importFile + importImageFile. - performFullSync passes opts.sourceId to runImport. - ImportImageOptions type accepts sourceId for runImport branch (importImageFile body wiring deferred — image imports out of scope for current use case; TS error fix only). Verified: real sync test against /tmp/test-sync routes 1 page to "testsync" source, 0 to default (post-fix). 19/19 source-id regression tests still pass. Typecheck clean. Co-Authored-By: Claude Opus 4.7 --- src/commands/import.ts | 11 ++++++++--- src/commands/sync.ts | 5 ++++- src/core/import-file.ts | 6 ++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/commands/import.ts b/src/commands/import.ts index 58992f11a..b7b8fe2a4 100644 --- a/src/commands/import.ts +++ b/src/commands/import.ts @@ -28,10 +28,15 @@ export interface RunImportResult { failures: Array<{ path: string; error: string }>; } -export async function runImport(engine: BrainEngine, args: string[], opts: { commit?: string } = {}): Promise { +export async function runImport(engine: BrainEngine, args: string[], opts: { commit?: string; sourceId?: string } = {}): Promise { const noEmbed = args.includes('--no-embed'); const fresh = args.includes('--fresh'); const jsonOutput = args.includes('--json'); + // v0.30.x follow-up to PR #707: programmatic sourceId support so internal + // callers (performFullSync, future Step 6 paths) can route to a named + // source. The CLI `gbrain import` deliberately has no --source flag per + // PR #707's design intent — only programmatic callers thread sourceId. + const sourceId = opts.sourceId; const workersIdx = args.indexOf('--workers'); const workersArg = workersIdx !== -1 ? 
args[workersIdx + 1] : null; // v0.22.13 (PR #490 Q2): shared parseWorkers helper rejects bad input @@ -110,8 +115,8 @@ export async function runImport(engine: BrainEngine, args: string[], opts: { com // up images when GBRAIN_EMBEDDING_MULTIMODAL=true so this branch is // unreachable when the gate is off; defense-in-depth check anyway. const result = isImageFilePath(relativePath) && process.env.GBRAIN_EMBEDDING_MULTIMODAL === 'true' - ? await importImageFile(eng, filePath, relativePath, { noEmbed }) - : await importFile(eng, filePath, relativePath, { noEmbed }); + ? await importImageFile(eng, filePath, relativePath, { noEmbed, sourceId }) + : await importFile(eng, filePath, relativePath, { noEmbed, sourceId }); if (result.status === 'imported') { imported++; chunksCreated += result.chunks; diff --git a/src/commands/sync.ts b/src/commands/sync.ts index 7f5c8da81..c9acf0f2d 100644 --- a/src/commands/sync.ts +++ b/src/commands/sync.ts @@ -965,7 +965,10 @@ async function performFullSync( const importArgs = [repoPath]; if (opts.noEmbed) importArgs.push('--no-embed'); if (fullConcurrency > 1) importArgs.push('--workers', String(fullConcurrency)); - const result = await runImport(engine, importArgs, { commit: headCommit }); + // v0.30.x follow-up to PR #707: thread sourceId through runImport's opts + // so performFullSync routes pages to the named source (the incremental + // sync path in this same file already does this on lines 581/641). + const result = await runImport(engine, importArgs, { commit: headCommit, sourceId: opts.sourceId }); // Bug 9 — gate the full-sync bookmark on success. 
runImport already // writes its own sync.last_commit conditionally (import.ts), but diff --git a/src/core/import-file.ts b/src/core/import-file.ts index 20e6fbf13..59d552708 100644 --- a/src/core/import-file.ts +++ b/src/core/import-file.ts @@ -886,6 +886,12 @@ export interface ImportImageOptions { ocrConcurrency?: number; /** Skip the embed call (for tests that want fast metadata-only inserts). */ noEmbed?: boolean; + /** + * v0.30.x follow-up to PR #707: route image-page writes to a named source. + * Mirrors importFromContent's threading; without this, runImport callers + * with sourceId would TS-error on the importImageFile branch. + */ + sourceId?: string; } /** Module-level limiter so concurrent imports across files share the budget. */ From fc50cc0c902453bf1f6aff6e9f6354c36f2471e5 Mon Sep 17 00:00:00 2001 From: Jeremy Knows Date: Fri, 8 May 2026 20:02:28 -0400 Subject: [PATCH 19/41] test: regression test for performFullSync sourceId threading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #707's existing 19-test suite at test/source-id-tx-regression.test.ts covers the engine-layer transaction surface (putPage / addTag / etc.) but does NOT exercise commands/sync.ts:performFullSync. Verified via `grep -c 'performFullSync' test/source-id-tx-regression.test.ts → 0`. This means the +18/-4 fix at sync.ts:892 (performFullSync passing sourceId to runImport) had no automated coverage. Adds 2 PGLite-only regression tests: 1. `performFullSync with --source routes pages to named source (not default)` — fixture: temp git repo with 2 markdown files. Calls performSync with { full: true, sourceId: 'testsrc-pfs', noPull: true, noEmbed: true }. Asserts pages.source_id = 'testsrc-pfs', not 'default'. Pre-fix: FAILS (verified by checking out 46cd197 — rebased PR #707 only, without my gap-fix — and running this test). Post-fix: PASSES. 2. 
`performFullSync WITHOUT --source still targets default (back-compat)` — same fixture, no sourceId opt. Asserts pages.source_id = 'default'. Both pre-fix and post-fix: PASSES (back-compat preserved by the fix). Verified: 21/21 tests pass on this branch (19 from PR #707 + 2 new). `bun run typecheck` clean. `bun run verify` clean (8 guard checks pass). Co-Authored-By: Claude Opus 4.7 --- test/performfullsync-source-id.test.ts | 139 +++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 test/performfullsync-source-id.test.ts diff --git a/test/performfullsync-source-id.test.ts b/test/performfullsync-source-id.test.ts new file mode 100644 index 000000000..cdde81b48 --- /dev/null +++ b/test/performfullsync-source-id.test.ts @@ -0,0 +1,139 @@ +/** + * v0.30.x follow-up to PR #707 — performFullSync source_id threading regression test. + * + * Pre-fix bug: + * - PR #707 fixed source_id routing for sync's incremental loop (sync.ts:581 + 641), + * but `performFullSync` (the path `--full` invokes) at sync.ts:892 called + * `runImport(engine, importArgs, { commit: headCommit })` without threading sourceId. + * - Result: `gbrain sync --source X --full` updated `sources.last_sync_at` to look + * like binding worked, but actual page rows landed in source_id='default'. + * - The 19 tests at test/source-id-tx-regression.test.ts validate the engine-layer + * transaction surface (putPage / addTag / etc.) but do NOT exercise performFullSync. + * Confirmed via: grep -c 'performFullSync' test/source-id-tx-regression.test.ts → 0. + * + * Fix (this PR-E follow-up to PR #707): + * - runImport accepts opts.sourceId (programmatic-only — no CLI flag, preserves + * PR #707's design intent of `gbrain import` being default-only). + * - runImport threads sourceId to importFile + importImageFile. + * - performFullSync passes opts.sourceId to runImport. 
+ * - ImportImageOptions type accepts sourceId (TS-only fix; image-import body + * wiring deferred — out of scope here, marked as a separate PR-C-style follow-up). + * + * This test verifies the sync-command-layer fix end-to-end on PGLite. + * + * Discovered: 2026-05-08 PRISM Round 2 Performance review on + * `~/atlas/agents/terminal/docs/atlas-needs-from-gbrain-spec-v2.1-2026-05-08.md` + * by Atlas Terminal agent. Test required as PR-E acceptance criterion. + */ + +import { describe, test, expect, beforeAll, afterAll, beforeEach, afterEach } from 'bun:test'; +import { mkdtempSync, writeFileSync, rmSync, mkdirSync } from 'fs'; +import { execSync } from 'child_process'; +import { tmpdir } from 'os'; +import { join } from 'path'; +import { PGLiteEngine } from '../src/core/pglite-engine.ts'; +import { runSources } from '../src/commands/sources.ts'; +import { resetPgliteState } from './helpers/reset-pglite.ts'; + +let engine: PGLiteEngine; +let repoPath: string; + +async function pageCountBySource(): Promise> { + const rows = await engine.executeRaw<{ source_id: string; n: number }>( + `SELECT source_id, COUNT(*)::int AS n FROM pages GROUP BY source_id`, + ); + const out: Record = {}; + for (const r of rows) out[r.source_id] = r.n; + return out; +} + +describe('performFullSync threads sourceId end-to-end', () => { + beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); + await runSources(engine, ['add', 'testsrc-pfs', '--no-federated']); + }, 60_000); + + afterAll(async () => { + if (engine) await engine.disconnect(); + }, 60_000); + + beforeEach(async () => { + await resetPgliteState(engine); + // resetPgliteState clears pages but doesn't drop the source row; re-add only if missing + const sources = await engine.executeRaw<{ id: string }>(`SELECT id FROM sources WHERE id = 'testsrc-pfs'`); + if (sources.length === 0) { + await runSources(engine, ['add', 'testsrc-pfs', '--no-federated']); + } + + repoPath = 
mkdtempSync(join(tmpdir(), 'gbrain-pfs-')); + execSync('git init', { cwd: repoPath, stdio: 'pipe' }); + execSync('git config user.email "test@test.com"', { cwd: repoPath, stdio: 'pipe' }); + execSync('git config user.name "Test"', { cwd: repoPath, stdio: 'pipe' }); + mkdirSync(join(repoPath, 'topics'), { recursive: true }); + writeFileSync(join(repoPath, 'topics/foo.md'), [ + '---', + 'type: concept', + 'title: Foo Topic', + '---', + '', + 'Test content for performFullSync source binding.', + ].join('\n')); + writeFileSync(join(repoPath, 'topics/bar.md'), [ + '---', + 'type: concept', + 'title: Bar Topic', + '---', + '', + 'Second test page to verify multi-page routing.', + ].join('\n')); + execSync('git add -A && git commit -m "initial"', { cwd: repoPath, stdio: 'pipe' }); + }); + + afterEach(() => { + if (repoPath) rmSync(repoPath, { recursive: true, force: true }); + }); + + test('performFullSync with --source routes pages to named source (not default)', async () => { + const { performSync } = await import('../src/commands/sync.ts'); + const result = await performSync(engine, { + repoPath, + full: true, + sourceId: 'testsrc-pfs', + noPull: true, + noEmbed: true, + }); + + // status is 'first_sync' for fresh imports, 'synced' for incremental — accept both + expect(['first_sync', 'synced']).toContain(result.status); + expect(result.added).toBeGreaterThan(0); + + const counts = await pageCountBySource(); + // Pre-fix bug: pages would land in 'default' (sources.last_sync_at would still + // update on testsrc-pfs, making the gap silent at the sources-list level). + // Post-fix: pages land in 'testsrc-pfs'. + expect(counts['testsrc-pfs']).toBeGreaterThan(0); + expect(counts['default'] ?? 
0).toBe(0); + }); + + test('performFullSync WITHOUT --source still targets default (back-compat preserved)', async () => { + const { performSync } = await import('../src/commands/sync.ts'); + const result = await performSync(engine, { + repoPath, + full: true, + // no sourceId — expect default-source behavior + noPull: true, + noEmbed: true, + }); + + // status is 'first_sync' for fresh imports, 'synced' for incremental — accept both + expect(['first_sync', 'synced']).toContain(result.status); + expect(result.added).toBeGreaterThan(0); + + const counts = await pageCountBySource(); + // Back-compat: callers that omit sourceId continue to target source 'default'. + expect(counts['default']).toBeGreaterThan(0); + expect(counts['testsrc-pfs'] ?? 0).toBe(0); + }); +}); From 9f7103a95de49efd504f224ba2aeb1af04fc5bca Mon Sep 17 00:00:00 2001 From: gus Date: Thu, 7 May 2026 22:24:58 -0300 Subject: [PATCH 20/41] fix(privacy): strip takes fence from get_page / get_versions when token carries an allow-list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.28.6 (#563) introduced the per-token takes-holder allow-list: an OAuth token carries `permissions.takes_holders` and `takes_list` / `takes_search` / `think.gather` filter take rows server-side via `WHERE t.holder = ANY($allowList)` in both engines. But take rows are stored in two places per the explicit contract in `extract-takes.ts:5-13` ("markdown is canonical, the takes table is a derived index"): the structured `takes` table AND inline in `pages.compiled_truth` between `TAKES_FENCE_BEGIN` / `TAKES_FENCE_END` markers as a markdown table whose `who` column IS the holder. A read-only token whose `takes_holders` is `["world"]` (the documented default-deny posture from migrate.ts:1221) can call `get_page <slug>` and recover every non-`world` claim verbatim from the body — private hunches, founder bets, non-public sourcing notes. 
`get_versions` has the same shape: snapshots persist historical compiled_truth verbatim, so a caller blocked at `get_page` falls through to /history. The team already shipped a complementary fix in `chunkers/recursive.ts:49` (stripTakesFence applied before the body is chunked, so `query` results don't leak fence content). Migration v38 documents this as a "complementary fix" — the page-CRUD surface was missed. Fix strips the fence at the op layer when `ctx.takesHoldersAllowList` is set (i.e. the remote MCP path). Local CLI callers leave the field unset and keep seeing the full fence. const visibleBody = ctx.takesHoldersAllowList ? { ...page, compiled_truth: stripTakesFence(page.compiled_truth) } : page; Same shape on `get_versions` over every snapshot in the array. Re-rendering the fence with allow-list-filtered rows would require joining the takes table per version_id and inverts the markdown-canonical contract; whole-fence strip is the conservative posture that closes the leak. A future allow-list-aware re-render is an additive change that won't break the contract pinned by these tests. Test coverage in `test/takes-mcp-allowlist.serial.test.ts`: - get_page with allow-list strips fence; surrounding body kept. - get_page without allow-list (local CLI) keeps fence (back-compat). - get_page fuzzy resolution path also strips for remote tokens. - get_versions with allow-list strips fence on every snapshot. - get_versions without allow-list returns historical content intact. The pre-fix R12 PoC reported `LEAKED garry hidden take? YES` and `LEAKED brain hidden take? YES`; post-fix the same PoC reports `no` for both holders and "bypass did not reproduce". 
--- src/core/operations.ts | 23 +++++- test/takes-mcp-allowlist.serial.test.ts | 98 +++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 2 deletions(-) diff --git a/src/core/operations.ts b/src/core/operations.ts index 70fc9c546..20edce36f 100644 --- a/src/core/operations.ts +++ b/src/core/operations.ts @@ -16,6 +16,7 @@ import { dedupResults } from './search/dedup.ts'; import { captureEvalCandidate, isEvalCaptureEnabled, isEvalScrubEnabled } from './eval-capture.ts'; import type { HybridSearchMeta } from './types.ts'; import { extractPageLinks, isAutoLinkEnabled, isAutoTimelineEnabled, parseTimelineEntries, makeResolver, type UnresolvedFrontmatterRef } from './link-extraction.ts'; +import { stripTakesFence } from './takes-fence.ts'; import * as db from './db.ts'; import { GET_RECENT_SALIENCE_DESCRIPTION, @@ -363,7 +364,19 @@ const get_page: Operation = { } const tags = await ctx.engine.getTags(page.slug); - return { ...page, tags, ...(resolved_slug ? { resolved_slug } : {}) }; + // Privacy boundary for the per-token takes-holder allow-list (v0.28.6). + // takes_list / takes_search / think.gather filter rows by holder at the + // SQL layer, but takes are also rendered as a markdown table inside the + // page body between TAKES_FENCE markers — `extract-takes.ts` ("markdown + // is canonical, the takes table is a derived index"). A read-only token + // restricted to e.g. `world` could call `get_page ` and recover + // every non-`world` claim verbatim from the body. Strip the fence here + // when the caller carries an allow-list (i.e. the remote MCP path). + // Local CLI callers leave takesHoldersAllowList unset and see the fence. + const visibleBody = ctx.takesHoldersAllowList + ? { ...page, compiled_truth: stripTakesFence(page.compiled_truth) } + : page; + return { ...visibleBody, tags, ...(resolved_slug ? 
{ resolved_slug } : {}) }; }, scope: 'read', cliHints: { name: 'get', positional: ['slug'] }, @@ -1407,7 +1420,13 @@ const get_versions: Operation = { slug: { type: 'string', required: true }, }, handler: async (ctx, p) => { - return ctx.engine.getVersions(p.slug as string); + const versions = await ctx.engine.getVersions(p.slug as string); + // Same takes-allow-list privacy boundary as get_page. Snapshots persist + // historical compiled_truth verbatim, including the takes fence, so + // a remote token bypassing get_page via /history would re-introduce + // the same leak across every prior version. + if (!ctx.takesHoldersAllowList) return versions; + return versions.map(v => ({ ...v, compiled_truth: stripTakesFence(v.compiled_truth) })); }, scope: 'read', cliHints: { name: 'history', positional: ['slug'] }, diff --git a/test/takes-mcp-allowlist.serial.test.ts b/test/takes-mcp-allowlist.serial.test.ts index 6a1748e1a..d168a376a 100644 --- a/test/takes-mcp-allowlist.serial.test.ts +++ b/test/takes-mcp-allowlist.serial.test.ts @@ -15,6 +15,7 @@ import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; import { PGLiteEngine } from '../src/core/pglite-engine.ts'; import { dispatchToolCall } from '../src/mcp/dispatch.ts'; +import { TAKES_FENCE_BEGIN, TAKES_FENCE_END } from '../src/core/takes-fence.ts'; let engine: PGLiteEngine; let alicePageId: number; @@ -105,6 +106,103 @@ describe('per-token takes-holder allow-list — takes_search', () => { }); }); +// --------------------------------------------------------------------------- +// Page-body channel: get_page / get_versions must respect the same allow-list. +// Take rows are stored in TWO places per the extract-takes contract: the +// `takes` table (filtered by the SQL `holder = ANY($allowList)` clause) and +// inline in `pages.compiled_truth` between TAKES_FENCE markers as a markdown +// table. 
Without a strip on the page-CRUD path, a `world`-only token reading +// `get_page ` recovers every non-`world` claim verbatim from the body. +// --------------------------------------------------------------------------- + +describe('per-token takes-holder allow-list — get_page body channel', () => { + const SLUG = 'people/bob-example'; + const FENCE_BODY = + '## Takes\n\n' + + `${TAKES_FENCE_BEGIN}\n` + + '\n| # | claim | kind | who | weight | since | source |\n' + + '|---|---|---|---|---|---|---|\n' + + '| 1 | CEO of Widget | fact | world | 1.0 | 2017-01 | Crustdata |\n' + + '| 2 | Strong technical founder | take | garry | 0.85 | 2026-04-29 | OH |\n' + + '| 3 | Seemed burned out in last OH | hunch | brain | 0.4 | 2026-05-01 | private |\n\n' + + `${TAKES_FENCE_END}\n` + + '\nFooter content stays.\n'; + + beforeAll(async () => { + await engine.putPage(SLUG, { title: 'Bob', type: 'person', compiled_truth: FENCE_BODY }); + }); + + test('remote token with allow-list strips fence from compiled_truth', async () => { + const result = await dispatchToolCall(engine, 'get_page', { slug: SLUG }, { + remote: true, + takesHoldersAllowList: ['world'], + }); + const page = parseResult(result) as { compiled_truth: string }; + expect(page.compiled_truth).not.toContain(TAKES_FENCE_BEGIN); + expect(page.compiled_truth).not.toContain(TAKES_FENCE_END); + expect(page.compiled_truth).not.toContain('Strong technical founder'); + expect(page.compiled_truth).not.toContain('Seemed burned out'); + expect(page.compiled_truth).not.toContain('| garry |'); + expect(page.compiled_truth).not.toContain('| brain |'); + // Surrounding body kept intact. 
+ expect(page.compiled_truth).toContain('Footer content stays.'); + }); + + test('local CLI (no allow-list) preserves the fence — backwards compatibility', async () => { + const result = await dispatchToolCall(engine, 'get_page', { slug: SLUG }, { + remote: false, + }); + const page = parseResult(result) as { compiled_truth: string }; + expect(page.compiled_truth).toContain(TAKES_FENCE_BEGIN); + expect(page.compiled_truth).toContain('Seemed burned out'); + }); + + test('fuzzy resolution path also strips for remote token', async () => { + const result = await dispatchToolCall(engine, 'get_page', { slug: 'people/bob-example', fuzzy: true }, { + remote: true, + takesHoldersAllowList: ['world', 'garry'], + }); + const page = parseResult(result) as { compiled_truth: string }; + // Allow-list does not yet re-render filtered rows; whole fence is stripped. + // Pinned so future re-rendering work is an additive change, not a silent + // semantic flip. + expect(page.compiled_truth).not.toContain(TAKES_FENCE_BEGIN); + expect(page.compiled_truth).not.toContain('Strong technical founder'); + }); +}); + +describe('per-token takes-holder allow-list — get_versions body channel', () => { + const SLUG = 'people/carol-example'; + const FENCE_BODY = + `${TAKES_FENCE_BEGIN}\n| # | claim | kind | who |\n|---|---|---|---|\n| 1 | private hunch | hunch | brain |\n${TAKES_FENCE_END}\n`; + + beforeAll(async () => { + await engine.putPage(SLUG, { title: 'Carol', type: 'person', compiled_truth: FENCE_BODY }); + await engine.createVersion(SLUG); // snapshot now has the fence + }); + + test('remote token with allow-list strips fence from every snapshot', async () => { + const result = await dispatchToolCall(engine, 'get_versions', { slug: SLUG }, { + remote: true, + takesHoldersAllowList: ['world'], + }); + const versions = parseResult(result) as Array<{ compiled_truth: string }>; + expect(versions.length).toBeGreaterThan(0); + for (const v of versions) { + 
expect(v.compiled_truth).not.toContain(TAKES_FENCE_BEGIN); + expect(v.compiled_truth).not.toContain('private hunch'); + } + }); + + test('local CLI sees historical takes in snapshots', async () => { + const result = await dispatchToolCall(engine, 'get_versions', { slug: SLUG }, { + remote: false, + }); + const versions = parseResult(result) as Array<{ compiled_truth: string }>; + expect(versions.some(v => v.compiled_truth.includes('private hunch'))).toBe(true); + }); +}); + describe('think op — read-only on remote callers (Lane D landed)', () => { test('remote save/take is forced read-only via remote_persisted_blocked flag', async () => { // Without ANTHROPIC_API_KEY, runThink returns gather-only result with NO_ANTHROPIC_API_KEY warning. From bee0129b772972fd255d9790a61a0b8857c82ff2 Mon Sep 17 00:00:00 2001 From: joelwp Date: Fri, 8 May 2026 09:47:22 -0600 Subject: [PATCH 21/41] Fix double-encoded jsonb in subagent_tool_executions breaking slug lookup persistToolExecPending/Failed/Complete called JSON.stringify(input) before passing to a $N::jsonb parameter. When input is already an object, this produces a JSON string which ::jsonb stores as a jsonb scalar -- not a jsonb object. Downstream queries like input->>slug then return NULL because the operator does not traverse scalar strings. Root cause fix: skip JSON.stringify when input is already a string. Query fix: use COALESCE with (input #>> '{}')::jsonb->>slug fallback to handle both old double-encoded rows and new properly-encoded rows. Affects: dream cycle synthesize phase (pages_written always 0) and patterns phase (same slug collection query). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/core/cycle/patterns.ts | 6 ++++-- src/core/cycle/synthesize.ts | 10 ++++++---- src/core/minions/handlers/subagent.ts | 10 +++++++--- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/core/cycle/patterns.ts b/src/core/cycle/patterns.ts index f3d281a29..8524294ed 100644 --- a/src/core/cycle/patterns.ts +++ b/src/core/cycle/patterns.ts @@ -223,13 +223,15 @@ async function collectChildPutPageSlugs( childIds: number[], ): Promise { if (childIds.length === 0) return []; + // Handle both properly-stored jsonb objects (input->>'slug') and + // double-encoded jsonb strings from pre-fix data ((input #>> '{}')::jsonb->>'slug'). const rows = await engine.executeRaw<{ slug: string }>( - `SELECT DISTINCT input->>'slug' AS slug + `SELECT DISTINCT + COALESCE(input->>'slug', (input #>> '{}')::jsonb->>'slug') AS slug FROM subagent_tool_executions WHERE job_id = ANY($1::int[]) AND tool_name = 'brain_put_page' AND status = 'complete' - AND input ? 'slug' ORDER BY 1`, [childIds], ); diff --git a/src/core/cycle/synthesize.ts b/src/core/cycle/synthesize.ts index 76198439f..e26787808 100644 --- a/src/core/cycle/synthesize.ts +++ b/src/core/cycle/synthesize.ts @@ -779,14 +779,16 @@ async function collectChildPutPageSlugs( ): Promise { if (childIds.length === 0) return []; // Raw fetch — NO SELECT DISTINCT. Preserves per-child slug duplicates so - // the orchestrator sees what each child wrote. + // the orchestrator sees what each child wrote. COALESCE handles both + // properly-stored jsonb objects (input->>'slug') and double-encoded jsonb + // strings from pre-fix data ((input #>> '{}')::jsonb->>'slug'). 
const rows = await engine.executeRaw<{ job_id: number; slug: string }>( - `SELECT job_id, input->>'slug' AS slug + `SELECT job_id, + COALESCE(input->>'slug', (input #>> '{}')::jsonb->>'slug') AS slug FROM subagent_tool_executions WHERE job_id = ANY($1::int[]) AND tool_name = 'brain_put_page' - AND status = 'complete' - AND input ? 'slug'`, + AND status = 'complete'`, [childIds], ); const rewritten = new Set(); diff --git a/src/core/minions/handlers/subagent.ts b/src/core/minions/handlers/subagent.ts index 4418e31d7..36c1edad3 100644 --- a/src/core/minions/handlers/subagent.ts +++ b/src/core/minions/handlers/subagent.ts @@ -630,11 +630,15 @@ async function persistToolExecPending( toolName: string, input: unknown, ): Promise { + // Serialize to JSON string for the ::jsonb cast. When `input` is already a + // string (e.g. pre-serialized), avoid double-encoding which produces a jsonb + // scalar string instead of a jsonb object — breaking `input->>'key'` lookups. + const jsonStr = typeof input === 'string' ? input : JSON.stringify(input); await engine.executeRaw( `INSERT INTO subagent_tool_executions (job_id, message_idx, tool_use_id, tool_name, input, status) VALUES ($1, $2, $3, $4, $5::jsonb, 'pending') ON CONFLICT (job_id, tool_use_id) DO NOTHING`, - [jobId, messageIdx, toolUseId, toolName, JSON.stringify(input)], + [jobId, messageIdx, toolUseId, toolName, jsonStr], ); } @@ -648,7 +652,7 @@ async function persistToolExecComplete( `UPDATE subagent_tool_executions SET status = 'complete', output = $3::jsonb, ended_at = now() WHERE job_id = $1 AND tool_use_id = $2`, - [jobId, toolUseId, JSON.stringify(output)], + [jobId, toolUseId, typeof output === 'string' ? 
output : JSON.stringify(output)], ); } @@ -668,7 +672,7 @@ async function persistToolExecFailed( VALUES ($1, $2, $3, $4, $5::jsonb, 'failed', $6, now()) ON CONFLICT (job_id, tool_use_id) DO UPDATE SET status = 'failed', error = EXCLUDED.error, ended_at = now()`, - [jobId, messageIdx, toolUseId, toolName, JSON.stringify(input), error], + [jobId, messageIdx, toolUseId, toolName, typeof input === 'string' ? input : JSON.stringify(input), error], ); } From 74b0db3ab1cc21b85c2f87c5e4fb18210e91588e Mon Sep 17 00:00:00 2001 From: Federico Cachero Date: Thu, 7 May 2026 16:14:44 -0300 Subject: [PATCH 22/41] fix(adapter/voyage): translate request/response between OpenAI-compat SDK and Voyage's actual contract The @ai-sdk/openai-compatible package treats Voyage as if it were OpenAI-shaped, but Voyage's /v1/embeddings endpoint diverges in three places that combine into a hard-blocking incompatibility: OUTBOUND request: - 'encoding_format=float' (SDK default) is rejected; Voyage only accepts 'base64' - 'dimensions' parameter (OpenAI name) is rejected; Voyage uses 'output_dimension' INBOUND response: - With encoding_format=base64, 'embedding' is returned as a base64 string, but the SDK's Zod schema (openaiTextEmbeddingResponseSchema) expects an 'array of number'. The schema fails with 'Invalid JSON response' even though the JSON is well-formed. - 'usage' lacks 'prompt_tokens'; the schema requires it when usage is present. Without this patch, ALL embedding requests to Voyage fail. Reproducible by running 'gbrain put < text' with embedding_model=voyage:voyage-* and any current voyage model (voyage-3-large, voyage-3, voyage-4-large). Solution: pass a custom 'fetch' to createOpenAICompatible only when recipe.id === 'voyage'. The fetch wrapper: 1. Forces encoding_format='base64' on outbound (Voyage's only accepted value) 2. Translates dimensions -> output_dimension on outbound 3. Drops Content-Length so the runtime recomputes from the mutated body 4. 
Decodes base64 embeddings to Float32 arrays on inbound (so the Zod schema sees what it expects) 5. Synthesizes prompt_tokens from total_tokens when missing This is a minimal, targeted fix. It only activates for Voyage and falls through cleanly for all other providers. No public API changes. --- src/core/ai/gateway.ts | 97 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/src/core/ai/gateway.ts b/src/core/ai/gateway.ts index 0db796f66..cdea47769 100644 --- a/src/core/ai/gateway.ts +++ b/src/core/ai/gateway.ts @@ -234,6 +234,96 @@ export function isAvailable(touchpoint: TouchpointKind): boolean { // ---- Embedding ---- +/** + * Voyage AI compatibility shim. Voyage's `/v1/embeddings` endpoint is OpenAI-shaped + * but diverges on two request parameters: + * - `encoding_format` only accepts `'base64'` (the AI SDK sends `'float'` by default, + * which makes Voyage respond with HTTP 400). Force `'base64'` and decode the + * response on the way back. + * - OpenAI's `dimensions` parameter is rejected; Voyage uses `output_dimension`. + * Translate the field name when the caller explicitly requested a dimension. + * + * The mutated body is what gets sent on the wire. On the way back, the shim decodes + * base64 embeddings to `number[]` and backfills `usage.prompt_tokens` for the SDK schema. + */ +const voyageCompatFetch: typeof fetch = async (input, init) => { + // OUTBOUND: rewrite request body for Voyage's actual API contract. + if (init?.body && typeof init.body === 'string') { + try { + const parsed = JSON.parse(init.body); + if (parsed && typeof parsed === 'object') { + let mutated = false; + // Voyage rejects 'float' (the SDK default). Force the value Voyage accepts. + if (parsed.encoding_format !== 'base64') { + parsed.encoding_format = 'base64'; + mutated = true; + } + // Translate OpenAI's `dimensions` to Voyage's `output_dimension`.
+ if ('dimensions' in parsed) { + const dims = parsed.dimensions; + delete parsed.dimensions; + if (typeof dims === 'number') parsed.output_dimension = dims; + mutated = true; + } + if (mutated) { + const newBody = JSON.stringify(parsed); + // Drop Content-Length so fetch recomputes from the new body. + const headers = new Headers(init.headers ?? {}); + headers.delete('content-length'); + init = { ...init, body: newBody, headers }; + } + } + } catch { + // Body wasn't JSON — pass through untouched. + } + } + + const resp = await fetch(input, init); + if (!resp.ok) return resp; + const ct = resp.headers.get('content-type') ?? ''; + if (!ct.toLowerCase().includes('application/json')) return resp; + + // INBOUND: rewrite response so the AI SDK's Zod schema validates. + // Voyage diverges from OpenAI in two places that break the parser: + // - `embedding` is a base64 string (SDK schema expects `number[]`) + // - `usage` lacks `prompt_tokens` (SDK schema requires it when usage present) + try { + const json: any = await resp.clone().json(); + if (!json || typeof json !== 'object') return resp; + let modified = false; + if (Array.isArray(json.data)) { + for (const item of json.data) { + if (item && typeof item.embedding === 'string') { + // Voyage returns Float32 little-endian base64. + const bytes = Buffer.from(item.embedding, 'base64'); + const floats = new Float32Array( + bytes.buffer, + bytes.byteOffset, + Math.floor(bytes.byteLength / 4), + ); + item.embedding = Array.from(floats); + modified = true; + } + } + } + if (json.usage && typeof json.usage === 'object' && json.usage.prompt_tokens === undefined) { + json.usage.prompt_tokens = typeof json.usage.total_tokens === 'number' + ? 
json.usage.total_tokens + : 0; + modified = true; + } + if (!modified) return resp; + return new Response(JSON.stringify(json), { + status: resp.status, + statusText: resp.statusText, + headers: resp.headers, + }); + } catch { + // If parsing/transformation fails, fall back to the original response. + return resp; + } +}; + async function resolveEmbeddingProvider(modelStr: string): Promise<{ model: any; recipe: Recipe; modelId: string }> { const { parsed, recipe } = resolveRecipe(modelStr); assertTouchpoint(recipe, 'embedding', parsed.modelId); @@ -297,6 +387,13 @@ function instantiateEmbedding(recipe: Recipe, modelId: string, cfg: AIGatewayCon name: recipe.id, baseURL: baseUrl, apiKey: apiKey ?? 'unauthenticated', + // Voyage AI's `/v1/embeddings` endpoint is "OpenAI-compatible" only in URL + // shape; it rejects `encoding_format=float` (only `base64` is accepted) and + // rejects OpenAI's `dimensions` parameter (Voyage uses `output_dimension`). + // The default openai-compatible client sends `encoding_format=float`, which + // makes Voyage respond with HTTP 400 "Bad Request". Rewrite those fields + // (and decode the base64 response) when targeting Voyage. + fetch: recipe.id === 'voyage' ? voyageCompatFetch : undefined, }); return client.textEmbeddingModel(modelId); } From 537df0805eaf88eeec96c6527c0fbd59c0429027 Mon Sep 17 00:00:00 2001 From: NineClaws Brain Date: Thu, 7 May 2026 13:57:10 +0000 Subject: [PATCH 23/41] feat(dream): support .md files in transcript discovery Transcript discovery only accepted .txt files. Many brain repos store meeting transcripts and conversation logs as .md (markdown), which is the natural format for brain content. Changes: - listTextFiles() now accepts both .txt and .md - basename extraction handles both extensions for date inference - readSingleTranscript() handles both extensions No behavior change for existing .txt-only setups.
--- src/core/cycle/transcript-discovery.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/core/cycle/transcript-discovery.ts b/src/core/cycle/transcript-discovery.ts index 19ae56ba5..2d8dcc886 100644 --- a/src/core/cycle/transcript-discovery.ts +++ b/src/core/cycle/transcript-discovery.ts @@ -127,7 +127,7 @@ function listTextFiles(dir: string): string[] { } const out: string[] = []; for (const name of entries) { - if (!name.endsWith('.txt')) continue; + if (!name.endsWith('.txt') && !name.endsWith('.md')) continue; const full = join(dir, name); try { if (statSync(full).isFile()) out.push(full); @@ -161,7 +161,8 @@ export function discoverTranscripts(opts: DiscoverOpts): DiscoveredTranscript[] const results: DiscoveredTranscript[] = []; for (const dir of dirs) { for (const filePath of listTextFiles(dir)) { - const baseName = basename(filePath, '.txt'); + const ext = filePath.endsWith('.md') ? '.md' : '.txt'; + const baseName = basename(filePath, ext); const dateMatch = DATE_RE.exec(baseName); const inferredDate = dateMatch ? dateMatch[1] : null; if (!isInDateRange(inferredDate, opts)) continue; @@ -214,12 +215,14 @@ export function readSingleTranscript( } if (content.length < minChars) return null; if (isDreamOutput(content, bypass)) { - const baseName = basename(filePath, '.txt'); + const ext = filePath.endsWith('.md') ? '.md' : '.txt'; + const baseName = basename(filePath, ext); process.stderr.write(`[dream] readSingleTranscript skipped ${baseName}: dream_generated marker (self-consumption guard)\n`); return null; } if (matchesAnyExclude(content, excludeRes)) return null; - const baseName = basename(filePath, '.txt'); + const ext = filePath.endsWith('.md') ? 
'.md' : '.txt'; + const baseName = basename(filePath, ext); const dateMatch = DATE_RE.exec(baseName); return { filePath, From dc2e63df285231945d2e44e9987a0d321a66f3f4 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:09:59 -0700 Subject: [PATCH 24/41] fix(test): cast exitCode to unknown for TS strict-narrowing TS narrows exitCode to null between declaration and assertion because the mocked process.exit is behind `(process as any).exit`. The cast preserves test intent without weakening the variable's type annotation. Wave-side merge fix; ships alongside #688 (extract --dir default). --- test/extract-fs.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/extract-fs.test.ts b/test/extract-fs.test.ts index 04f97dd82..4121e2889 100644 --- a/test/extract-fs.test.ts +++ b/test/extract-fs.test.ts @@ -217,7 +217,7 @@ describe('gbrain extract --dir default resolution', () => { (process as any).exit = savedExit; console.error = savedConsoleError; } - expect(exitCode).toBe(1); + expect(exitCode as unknown).toBe(1); const all = errBuf.join('\n'); expect(all).toContain('No brain directory configured'); expect(all).toContain('--source db'); From a3d1f03d398beca03d5a53b73ed308f6ae0ac15c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:31:07 -0700 Subject: [PATCH 25/41] fix(cli): add frontmatter + check-resolvable to CLI_ONLY_SELF_HELP Companion to #634. Both commands have their own --help logic that prints detailed usage with command-specific flags (e.g., --json, --fix, --strict for check-resolvable). Without this, pr-634's generic short-circuit prints "Usage: gbrain - run gbrain --help for the full command list." and the existing --help integration tests fail. Verified: `gbrain frontmatter --help` and `gbrain check-resolvable --help` now route to their handlers, which print full per-command usage and exit 0. 
--- src/cli.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cli.ts b/src/cli.ts index 745e1c821..792a6adb9 100755 --- a/src/cli.ts +++ b/src/cli.ts @@ -33,6 +33,7 @@ const CLI_ONLY_SELF_HELP = new Set([ 'embed', 'config', 'skillpack', 'skillpack-check', 'integrations', 'friction', + 'frontmatter', 'check-resolvable', ]); async function main() { From 3d3d725d3324896cb194a281cb2221ae5282c02c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:31:07 -0700 Subject: [PATCH 26/41] fix(test): update discoverTranscripts test expectation for .md support Companion to #708. The pre-#708 test asserted that .md files in the session-corpus directory were skipped. Post-#708 they are discovered alongside .txt. Renamed the test to 'skips non-txt non-md files' (uses .pdf as the negative case) and added a positive .md discovery test that pins #708's intended behavior. --- test/cycle-synthesize.test.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/cycle-synthesize.test.ts b/test/cycle-synthesize.test.ts index cfee8c6a0..87819462c 100644 --- a/test/cycle-synthesize.test.ts +++ b/test/cycle-synthesize.test.ts @@ -86,12 +86,21 @@ describe('discoverTranscripts', () => { expect(out).toEqual([]); }); - test('skips non-txt files', () => { - makeTranscript('2026-04-25-foo.md', 'a'.repeat(3000)); + test('skips non-txt non-md files', () => { + // v0.30.3 (#708): .md files are now supported alongside .txt; only other + // extensions (e.g., .pdf, .doc) should be skipped by discovery. 
+ makeTranscript('2026-04-25-foo.pdf', 'a'.repeat(3000)); const out = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); expect(out).toEqual([]); }); + test('discovers .md transcript files (#708)', () => { + makeTranscript('2026-04-25-foo.md', 'a'.repeat(3000)); + const out = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); + expect(out).toHaveLength(1); + expect(out[0].basename).toBe('2026-04-25-foo'); + }); + test('exclude_patterns filters out matched transcripts (word boundary)', () => { makeTranscript('2026-04-25-medical.txt', 'discussing medical advice ' + 'x'.repeat(3000)); makeTranscript('2026-04-25-comedy.txt', 'comedical writing tips ' + 'x'.repeat(3000)); From fc85dfb46253bc124a409819d209901e0623ec9c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:31:07 -0700 Subject: [PATCH 27/41] fix(skills): declare missing RESOLVER triggers in skill frontmatter Companion to #718. The RESOLVER round-trip test (test/resolver.test.ts) fuzzy-matches every RESOLVER.md trigger phrase against the target skill's frontmatter triggers list. pr-718 added six new RESOLVER routings without declaring matching triggers: - media-ingest: 'PDF book', 'summarize this book', 'ingest it into my brain' - article-enrichment: 'enriching the article', 'enrich the article', 'enrich pass' - concept-synthesis: 'canon vs riff' - perplexity-research: 'perplexity-research', 'surface new developments' - academic-verify: 'Retraction Watch' - voice-note-ingest: 'audio message' Adds the missing triggers verbatim to each skill's frontmatter so the round-trip invariant holds. 
--- skills/academic-verify/SKILL.md | 1 + skills/article-enrichment/SKILL.md | 3 +++ skills/concept-synthesis/SKILL.md | 1 + skills/media-ingest/SKILL.md | 3 +++ skills/perplexity-research/SKILL.md | 2 ++ skills/voice-note-ingest/SKILL.md | 1 + 6 files changed, 11 insertions(+) diff --git a/skills/academic-verify/SKILL.md b/skills/academic-verify/SKILL.md index fd037d701..ba4941724 100644 --- a/skills/academic-verify/SKILL.md +++ b/skills/academic-verify/SKILL.md @@ -8,6 +8,7 @@ triggers: - "academic verify" - "validate citation" - "is this study real" + - "Retraction Watch" mutating: true writes_pages: true writes_to: diff --git a/skills/article-enrichment/SKILL.md b/skills/article-enrichment/SKILL.md index 1aea602ac..ba80a2687 100644 --- a/skills/article-enrichment/SKILL.md +++ b/skills/article-enrichment/SKILL.md @@ -4,8 +4,11 @@ version: 0.1.0 description: Transform raw article text dumps in the brain into structured pages with executive summary, verbatim quotes, key insights, why-it-matters, and cross-references. Replaces walls-of-text with quotable, actionable brain pages. 
triggers: - "enrich this article" + - "enrich the article" + - "enriching the article" - "enrich brain pages" - "batch enrich" + - "enrich pass" - "make brain pages useful" mutating: true writes_pages: true diff --git a/skills/concept-synthesis/SKILL.md b/skills/concept-synthesis/SKILL.md index 62d115040..4a36bc596 100644 --- a/skills/concept-synthesis/SKILL.md +++ b/skills/concept-synthesis/SKILL.md @@ -8,6 +8,7 @@ triggers: - "find patterns across my notes" - "build my intellectual map" - "trace idea evolution" + - "canon vs riff" mutating: true writes_pages: true writes_to: diff --git a/skills/media-ingest/SKILL.md b/skills/media-ingest/SKILL.md index 75bf3b2cf..1c3940446 100644 --- a/skills/media-ingest/SKILL.md +++ b/skills/media-ingest/SKILL.md @@ -11,6 +11,9 @@ triggers: - "ingest this PDF" - "save this podcast" - "process this book" + - "PDF book" + - "summarize this book" + - "ingest it into my brain" - "what's in this screenshot" - "check out this repo" tools: diff --git a/skills/perplexity-research/SKILL.md b/skills/perplexity-research/SKILL.md index 58e93055e..8e36056ec 100644 --- a/skills/perplexity-research/SKILL.md +++ b/skills/perplexity-research/SKILL.md @@ -4,10 +4,12 @@ version: 0.1.0 description: Brain-augmented web research. Sends brain context about a topic to Perplexity, which searches the web with citations and returns what is NEW vs what the brain already knows. Use for entity enrichment, current-state checks, deal monitoring, and freshness deltas. NOT for simple URL fetches (use web_fetch) or brain-only queries (use gbrain query). 
triggers: - "perplexity research" + - "perplexity-research" - "what's new about" - "current state of" - "web research" - "what changed about" + - "surface new developments" mutating: true writes_pages: true writes_to: diff --git a/skills/voice-note-ingest/SKILL.md b/skills/voice-note-ingest/SKILL.md index 2b1cf98c7..c4b4c6558 100644 --- a/skills/voice-note-ingest/SKILL.md +++ b/skills/voice-note-ingest/SKILL.md @@ -8,6 +8,7 @@ triggers: - "transcribe and file" - "voice note ingest" - "save this audio note" + - "audio message" mutating: true writes_pages: true writes_to: From f17fee207974179da7009b82a7a585a8073891ce Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:31:08 -0700 Subject: [PATCH 28/41] chore: regenerate llms.txt + llms-full.txt after wave skill updates --- llms-full.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llms-full.txt b/llms-full.txt index 2b6f86455..6ee374fd7 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -1545,7 +1545,7 @@ This is the dispatcher. Skills are the implementation. 
**Read the skill file bef | Trigger | Skill | |---------|-------| | User shares a link, article, tweet, or idea | `skills/idea-ingest/SKILL.md` | -| Video, audio, PDF, book, YouTube, screenshot | `skills/media-ingest/SKILL.md` | +| "video", "PDF book", "YouTube", "screenshot", "summarize this book", "ingest this PDF", "process this book", "ingest it into my brain" | `skills/media-ingest/SKILL.md` | | Meeting transcript received | `skills/meeting-ingestion/SKILL.md` | | Generic "ingest this" (auto-routes to above) | `skills/ingest/SKILL.md` | @@ -1626,23 +1626,23 @@ These apply to ALL brain-writing skills: | Trigger | Skill | |---------|-------| -| "personalized version of this book" | `skills/book-mirror/SKILL.md` | +| "personalized version of this book", "mirror this book", "two-column book", "book to my life", "this book apply to me", "personalized version" | `skills/book-mirror/SKILL.md` | -| "enrich this article" | `skills/article-enrichment/SKILL.md` | +| "enrich this article", "enriching the article", "enrich the article", "enrich brain pages", "batch enrich", "enrich pass" | `skills/article-enrichment/SKILL.md` | -| "strategic reading" | `skills/strategic-reading/SKILL.md` | +| "strategic reading", "read this through the lens", "apply this to my problem", "what can I learn from this", "extract a playbook from this" | `skills/strategic-reading/SKILL.md` | -| "concept synthesis" | `skills/concept-synthesis/SKILL.md` | +| "concept synthesis", "synthesize my concepts", "intellectual map", "find patterns across my notes", "trace idea evolution", "canon vs riff" | `skills/concept-synthesis/SKILL.md` | -| "perplexity research" | `skills/perplexity-research/SKILL.md` | +| "perplexity research", "perplexity-research", "what's new about this", "current state of", "web research pass", "what changed about", "surface new developments" | `skills/perplexity-research/SKILL.md` | -| "crawl my archive" | `skills/archive-crawler/SKILL.md` | +| "crawl my archive", "find gold in 
my archive", "archive crawler", "scan my dropbox", "mine my old files" | `skills/archive-crawler/SKILL.md` | -| "verify this academic claim" | `skills/academic-verify/SKILL.md` | +| "verify this academic claim", "check this study", "academic verify", "validate citation", "Retraction Watch", "is this study real" | `skills/academic-verify/SKILL.md` | -| "make pdf from brain" | `skills/brain-pdf/SKILL.md` | +| "make pdf from brain", "brain pdf", "convert brain page to pdf", "page as pdf", "export brain page", "publish this page as pdf" | `skills/brain-pdf/SKILL.md` | -| "voice note" | `skills/voice-note-ingest/SKILL.md` | +| "voice note", "voice memo", "audio message", "audio note", "transcribe and file" | `skills/voice-note-ingest/SKILL.md` | --- From bd3809035199764d3a0748563aa05413f44b4422 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:38:17 -0700 Subject: [PATCH 29/41] v0.30.3 release: bump VERSION + CHANGELOG entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 22-PR community fix wave with one P0 security upgrade (auth-code scope escalation closed). 19 PRs landed across 5 lanes; 3 superseded by master during cherry-pick; 1 deferred per E2 protocol (#681 architectural conflict with v0.28 takes-holders); follow-up filed. Headline fixes: #727 (auth-code scope-clamp, RFC 6749 §3.3 compliance), #740/#751 (v0.29.1 PGLite migration connect), #741 (v39-v41 forward- reference bootstrap), #757 (multi-source sourceId threading, closes Postgres 21000), #728 (takes-fence redaction on remote reads). See CHANGELOG.md for full per-PR attribution and decision history. 
Co-Authored-By: lanceretter Co-Authored-By: alexandreroumieu-codeapprentice Co-Authored-By: brandonlipman Co-Authored-By: gus Co-Authored-By: jeremyknows Co-Authored-By: Trevin Chow Co-Authored-By: WD Co-Authored-By: Federico Cachero Co-Authored-By: Brandon Lipman Co-Authored-By: joshsteinvc Co-Authored-By: mgunnin Co-Authored-By: NineClaws Brain Co-Authored-By: joelwp Co-Authored-By: Oscar --- CHANGELOG.md | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++ VERSION | 2 +- package.json | 2 +- 3 files changed, 75 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb73d3c9e..12209df06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,79 @@ All notable changes to GBrain will be documented in this file. +## [0.30.3] - 2026-05-08 + +**Security upgrade priority: closes an authorization-code scope-escalation in `gbrain serve --http`. Plus 18 community fix-wave PRs covering upgrade-path correctness, multi-source sync, takes-fence privacy, dream cycle reliability, and CLI hygiene.** + +49 community PRs accumulated since v0.28. v0.30.3 lands the highest-leverage fix wave: 19 PRs across 5 lanes, single PR, atomic per-PR commits for full bisect granularity. The headline is a real security fix: any OAuth client with a `read` scope could mint an authorization code asking for `admin` and the code landed in the database ungated. v0.30.3 closes that. + +### Upgrade priority — auth-code scope-escalation (#727) + +If you run `gbrain serve --http` with OAuth, **upgrade now**. The pre-fix `authorize()` wrote `params.scopes` straight into `oauth_codes` with zero intersection against the registered `client.scope`. Per RFC 6749 §3.3, the granted scope must be a subset of the client's allowed scope. Now it is. Existing tokens are unaffected (they were minted under the registered scope); the fix only narrows what new authorization codes can grant. Contributed by [@garagon](https://github.com/garagon). 
+ +### What you can now do + +**Postgres `21000` mid-import is fixed.** Multi-source brains running `gbrain sync --full` against more than one source were hitting `Postgres error 21000` because `performFullSync` didn't thread `sourceId` through per-page transactions. The fix supersedes #639 + #707 with full surface coverage including `runImport` and the post-sync extract phase. Closes #497, #540. Contributed by [@jeremyknows](https://github.com/jeremyknows) (rebases work from [@100yenadmin](https://github.com/100yenadmin) and [@mdcruz88](https://github.com/mdcruz88)). + +**Old PGLite brains upgrade cleanly through v39-v41.** `applyForwardReferenceBootstrap` was missing six column-with-index forward references in the embedded schema blob: `content_chunks.modality`, `pages.emotional_weight`, `pages.effective_date`, `pages.effective_date_source`, `pages.import_filename`, `pages.salience_touched_at`. Brains stuck at `config.version < 39` (Postgres) or `< 41` (PGLite) wedged with `column "..." does not exist` before migrations could advance. Reproduced end-to-end on a PlanetScale Postgres brain at v34 trying to upgrade to v0.30.0. Contributed by [@lanceretter](https://github.com/lanceretter). + +**`gbrain upgrade` no longer crashes mid-migration on the v0.29.1 backfill.** Phase B and Phase C of the v0.29.1 migration created an engine via `createEngine(...)` but never called `engine.connect(...)` before use. The image-decoder dependencies (`@jsquash/png`, `heic-decode`) were missing from `package.json`. The backfill used bare `BEGIN`/`COMMIT` instead of `engine.transaction()`, which is unsafe on pooled connections. All three are fixed plus a regression test pinning the connect invariant. Contributed by [@lanceretter](https://github.com/lanceretter) and [@alexandreroumieu-codeapprentice](https://github.com/alexandreroumieu-codeapprentice). 
+ +**Takes-fence redaction on remote reads.** Per-token MCP allow-list tokens minted since v0.28.6 were getting takes content through `get_page` and `get_versions` because those handlers were raw passthroughs. The privacy fix strips the takes fence whenever the calling token carries an allow-list. Contributed by [@garagon](https://github.com/garagon). + +**Other quality-of-life fixes.** + +- **`gbrain sync` on detached-HEAD repos** ingests the working tree instead of crashing on `git pull`. ([#635](https://github.com/garrytan/gbrain/pull/635), [@tmchow](https://github.com/tmchow)) +- **`gbrain sync --skip-failed`** now eagerly acks pre-existing unacked failures so the bookmark advances on the same run. ([#686](https://github.com/garrytan/gbrain/pull/686), [@brandonlipman](https://github.com/brandonlipman)) +- **`gbrain extract`** defaults `--dir` to the configured brain dir and prints an actionable error when no source is configured. ([#688](https://github.com/garrytan/gbrain/pull/688), [@brandonlipman](https://github.com/brandonlipman)) +- **Slug normalization in extract** lowercases via `pathToSlug()` so `Capital-Filename.md` doesn't write a different slug than every other call site. ([#736](https://github.com/garrytan/gbrain/pull/736), [@Freddy-Cach](https://github.com/Freddy-Cach)) +- **`gbrain init --help`** doesn't execute init anymore. CLI_ONLY commands short-circuit on `--help` instead of running. ([#634](https://github.com/garrytan/gbrain/pull/634), [@tmchow](https://github.com/tmchow)) +- **`gbrain doctor`** auto-detects the skills directory so it works inside OpenClaw workspaces without `--dir`. Fixes the `graph_coverage` warn message typo too. ([#684](https://github.com/garrytan/gbrain/pull/684), [#687](https://github.com/garrytan/gbrain/pull/687), [@brandonlipman](https://github.com/brandonlipman)) +- **Stdio MCP server** exits cleanly on stdin-close / SIGTERM instead of leaving an orphan process holding the PGLite advisory lock. 
([#692](https://github.com/garrytan/gbrain/pull/692), [@joshsteinvc](https://github.com/joshsteinvc)) +- **`detectBunLink`** survives `bun`'s symlink resolution in `argv[1]` so postinstall doesn't silently fail on bun-linked installs. ([#704](https://github.com/garrytan/gbrain/pull/704), [@MrAladdin](https://github.com/MrAladdin)) +- **RESOLVER triggers** broadened: 37 routing-eval misses → 0 (100% top-1 accuracy). ([#718](https://github.com/garrytan/gbrain/pull/718), [@mgunnin](https://github.com/mgunnin)) +- **Dream transcript discovery** picks up `.md` files alongside `.txt`. ([#708](https://github.com/garrytan/gbrain/pull/708), [@joelwp](https://github.com/joelwp)) +- **Dream cycle slug lookup** survives double-encoded jsonb in `subagent_tool_executions.input`. The orchestrator no longer silently writes nothing on the "queue green, brain empty" failure mode. ([#745](https://github.com/garrytan/gbrain/pull/745), [@joelwp](https://github.com/joelwp)) +- **Voyage embedding adapter** translates between the OpenAI-compat SDK shape and Voyage's actual contract for `encoding_format`. ([#735](https://github.com/garrytan/gbrain/pull/735), [@Freddy-Cach](https://github.com/Freddy-Cach)) + +### To take advantage of v0.30.3 + +```bash +gbrain upgrade +``` + +If `gbrain doctor` warns about a partial migration after upgrade: + +```bash +gbrain apply-migrations --yes +``` + +If you were stuck on v34-era PGLite or hitting `column "modality" does not exist` mid-upgrade, the bootstrap fix means the next `gbrain upgrade` walks forward cleanly. No manual recovery needed. + +If you run `gbrain serve --http` with OAuth, your existing tokens stay valid. New authorization codes minted after upgrade will be scope-clamped per RFC 6749 §3.3. + +### Closed as superseded + +- **#682** (forward-reference bootstrap v0.20 + v0.26.3) — every column it added is already in master through prior fix waves; verified during cherry-pick. 
Codex C7's `subagent_messages.provider_id` check satisfied without merging. +- **#683** (chmod +x cli.ts) — already executable in master. +- **#743** — superseded by #740's UNSAFE_TRANSACTION + connect fixes. +- **#668** — superseded by #682 + #741 union (v0.20 + v0.26.3 + v39-v41 coverage). +- **#639, #707** — superseded by #757 (full superset including `performFullSync` gap). +- **#748** — superseded by v0.30.2 / #754 (synthesize chunking). + +### Deferred to follow-up + +- **#681** (route HTTP auth SQL through active engine) — real architectural conflict between pr-681's narrow `SqlQuery` abstraction and v0.28's `sql.json()` writes for takes-holders. Re-author needed in a follow-up that extends `SqlQuery` to support JSONB or routes JSONB writes through `engine.executeRaw`. Will ship as its own focused PR. +- **#676** (stdio MCP cleanup on disconnect, 658 lines) — chronic real bug; deferred this round to keep wave size manageable. Will ship as its own focused PR. + +### For contributors + +The v0.30.3 wave shipped after three review passes: CEO scope review (`/plan-ceo-review`), engineering review (`/plan-eng-review`), and codex outside-voice (`/codex`). The codex pass surfaced 9 findings prior reviews missed — most importantly that #727 was misclassified as RFC polish when it's a real auth-code scope-escalation, and that the original commit-shape plan (lane-squashes via `git reset --soft`) would have degraded `git blame` provenance. All 9 codex findings were resolved as decisions C1-C9 in the approved plan; #727 was reclassified to Lane 1 P0 and the wave shipped as 22+ atomic per-PR commits. + +The wave was assembled by cherry-picking each PR onto a wave branch, resolving conflicts where master had moved on (the bootstrap chain, the `auth.ts` JSONB-writes seam, the `extract.ts` slug normalization × multi-source sourceId interaction, and the dream-cycle synthesize.ts query merge). 
Three companion commits patch test expectations or RESOLVER frontmatter declarations that the cherry-picks needed but didn't ship: a `frontmatter` + `check-resolvable` `CLI_ONLY_SELF_HELP` extension (companion to #634), a `discoverTranscripts` test update for `.md` support (companion to #708), and missing RESOLVER trigger declarations in 6 skill frontmatters (companion to #718). + +Tests: 4570 unit pass / 1 pre-existing master flake (`BrainRegistry — lazy init > empty/null/undefined id routes to host`, present on master before this wave). The 5 wave-introduced failures from the cherry-pick assembly are all fixed in companion commits. + ## [0.30.2] - 2026-05-08 **Dream synthesize stops dropping fat transcripts. Subagents that overflow Anthropic's context die once, not three times. The queue stops clogging.** diff --git a/VERSION b/VERSION index 0f7217737..e8262eb52 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.30.2 +0.30.3 diff --git a/package.json b/package.json index 347104bb2..e20d9dee0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gbrain", - "version": "0.30.2", + "version": "0.30.3", "description": "Postgres-native personal knowledge brain with hybrid RAG search", "type": "module", "main": "src/core/index.ts", From fd2416f98cc5f62dc69b11cfa2d544375b62df13 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:41:49 -0700 Subject: [PATCH 30/41] test(C6): regression test for #745 collectChildPutPageSlugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex-mandated test gate (C6 from /codex review of v0.30.3 plan). Pins behavior of collectChildPutPageSlugs() under both jsonb shapes: - jsonb_typeof='object' (post-#745, normal write path) - jsonb_typeof='string' (pre-#745 double-encoded, the bug shape) Without this guard, a future regression of #745 would silently drop slugs: child jobs finish, queue looks healthy, orchestrator writes nothing. 
Worst on-call shape — silent failure with no alerting surface. Adds an `__testing` namespace to src/core/cycle/synthesize.ts re-exporting collectChildPutPageSlugs at unit-test granularity. Not part of the runtime contract; matches the v0_29_1.ts `__testing` precedent for engine-internal helpers. --- src/core/cycle/synthesize.ts | 8 ++ test/cycle-synthesize-slug-collection.test.ts | 105 ++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 test/cycle-synthesize-slug-collection.test.ts diff --git a/src/core/cycle/synthesize.ts b/src/core/cycle/synthesize.ts index e26787808..c576473bd 100644 --- a/src/core/cycle/synthesize.ts +++ b/src/core/cycle/synthesize.ts @@ -989,3 +989,11 @@ function failed(error: PhaseError): PhaseResult { function makeError(cls: string, code: string, message: string, hint?: string): PhaseError { return hint ? { class: cls, code, message, hint } : { class: cls, code, message }; } + +// ── Test-only export ─────────────────────────────────────── +// `__testing` re-exports otherwise-private helpers so unit tests can pin +// behavior at function granularity (e.g., #745 collectChildPutPageSlugs +// double-encoded jsonb regression). Not part of the runtime contract. +export const __testing = { + collectChildPutPageSlugs, +}; diff --git a/test/cycle-synthesize-slug-collection.test.ts b/test/cycle-synthesize-slug-collection.test.ts new file mode 100644 index 000000000..82a08f505 --- /dev/null +++ b/test/cycle-synthesize-slug-collection.test.ts @@ -0,0 +1,105 @@ +/** + * v0.30.3 codex-mandated test gate C6 — regression for #745. + * + * `collectChildPutPageSlugs` reads `input->>'slug'` from + * `subagent_tool_executions`. Pre-#745 this failed silently when the + * `input` column held a double-encoded JSONB string (jsonb_typeof='string' + * containing '"{...}"' instead of jsonb_typeof='object'). 
The orchestrator + * collected zero slugs, child jobs finished, queue looked healthy, and + * the brain wrote nothing — the worst possible on-call shape. + * + * #745 added a COALESCE that handles both the proper jsonb-object shape and + * the double-encoded jsonb-string shape: + * + * COALESCE(input->>'slug', (input #>> '{}')::jsonb->>'slug') AS slug + * + * This test seeds both shapes in `subagent_tool_executions` and asserts the + * function recovers slugs from both. + */ + +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { PGLiteEngine } from '../src/core/pglite-engine.ts'; +import { __testing } from '../src/core/cycle/synthesize.ts'; + +const { collectChildPutPageSlugs } = __testing; + +let engine: PGLiteEngine; + +beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); + + // Seed minion_jobs parent rows so subagent_tool_executions FK is satisfied. + // The function only reads tool_executions; minion_jobs just needs to exist. + const db = (engine as any).db; + await db.exec(` + INSERT INTO minion_jobs (id, queue, name, data, status) + VALUES + (1001, 'default', 'subagent', '{}'::jsonb, 'completed'), + (1002, 'default', 'subagent', '{}'::jsonb, 'completed'), + (1003, 'default', 'subagent', '{}'::jsonb, 'completed') + ON CONFLICT (id) DO NOTHING; + `); +}); + +afterAll(async () => { + await engine.disconnect(); +}); + +describe('C6: collectChildPutPageSlugs survives double-encoded jsonb (#745)', () => { + test('recovers slug from properly-stored jsonb object (post-fix)', async () => { + const db = (engine as any).db; + // Use raw SQL with jsonb literal to ensure object shape, not string shape. 
+ await db.query( + `INSERT INTO subagent_tool_executions (job_id, message_idx, tool_use_id, tool_name, status, input) + VALUES (1001, 0, 'tool_a', 'brain_put_page', 'complete', $1::jsonb)`, + [JSON.stringify({ slug: 'wiki/agents/test/normal-shape', body: 'hi' })], + ); + const slugs = await collectChildPutPageSlugs(engine as any, [1001], new Map()); + expect(slugs).toContain('wiki/agents/test/normal-shape'); + }); + + test('recovers slug from DOUBLE-ENCODED jsonb string (#745 fix)', async () => { + const db = (engine as any).db; + // Construct double-encoded shape: input column contains a jsonb STRING + // (jsonb_typeof='string') whose VALUE is the JSON-encoded object. + // This is the bug-shape pre-#745: writing JSON.stringify of the object + // into a jsonb column produced jsonb_typeof='string', not 'object'. + const doubleEncoded = JSON.stringify( + JSON.stringify({ slug: 'wiki/agents/test/double-encoded', body: 'hi' }), + ); + await db.query( + `INSERT INTO subagent_tool_executions (job_id, message_idx, tool_use_id, tool_name, status, input) + VALUES (1002, 0, 'tool_b', 'brain_put_page', 'complete', $1::jsonb)`, + [doubleEncoded], + ); + + // Sanity check: confirm the row IS double-encoded (jsonb_typeof='string'). 
+ const probe = await db.query( + `SELECT jsonb_typeof(input) AS t FROM subagent_tool_executions WHERE job_id=1002`, + ); + expect(probe.rows[0].t).toBe('string'); + + const slugs = await collectChildPutPageSlugs(engine as any, [1002], new Map()); + expect(slugs).toContain('wiki/agents/test/double-encoded'); + }); + + test('handles MIXED inputs: returns slugs from both shapes in one query', async () => { + const slugs = await collectChildPutPageSlugs(engine as any, [1001, 1002], new Map()); + expect(slugs).toContain('wiki/agents/test/normal-shape'); + expect(slugs).toContain('wiki/agents/test/double-encoded'); + }); + + test('skips rows without a slug field gracefully (no throw)', async () => { + const db = (engine as any).db; + await db.query( + `INSERT INTO subagent_tool_executions (job_id, message_idx, tool_use_id, tool_name, status, input) + VALUES (1003, 0, 'tool_c', 'brain_put_page', 'complete', $1::jsonb)`, + [JSON.stringify({ unrelated: 'no-slug' })], + ); + const slugs = await collectChildPutPageSlugs(engine as any, [1003], new Map()); + // Function silently drops rows whose slug resolves to null/empty. + expect(slugs).not.toContain('no-slug'); + }); +}); From 726b6f8061f5afea01e0362427f013b1d680709f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:42:45 -0700 Subject: [PATCH 31/41] test(C8): #708 .md transcript discovery + self-consumption guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex-mandated test gate (C8 from /codex review of v0.30.3 plan). Pins three invariants for #708's broadening of transcript discovery: 1. .md files ARE discovered alongside .txt (the feature works). 2. Other extensions (.pdf, .doc, .json) are still SKIPPED. 3. v0.30.2's dream_generated frontmatter marker MUST guard .md files against self-consumption — without this, every dream cycle would loop on its own output indefinitely. 
Adversarial cases: BOM + CRLF tolerance on .md frontmatter; the --unsafe-bypass-dream-guard escape hatch for .md output; mixed .txt + .md corpus dedup behavior pinned. --- test/cycle-synthesize-md-discovery.test.ts | 121 +++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 test/cycle-synthesize-md-discovery.test.ts diff --git a/test/cycle-synthesize-md-discovery.test.ts b/test/cycle-synthesize-md-discovery.test.ts new file mode 100644 index 000000000..808967bf8 --- /dev/null +++ b/test/cycle-synthesize-md-discovery.test.ts @@ -0,0 +1,121 @@ +/** + * v0.30.3 codex-mandated test gate C8 — #708 dream-cycle .md discovery. + * + * #708 broadened transcript discovery from .txt-only to .txt + .md. + * Codex flagged this as a hot-path change immediately after v0.30.2's + * chunking + self-consumption work. This gate pins three invariants: + * + * 1. .md transcripts are DISCOVERED (the feature works). + * 2. Other extensions (.pdf, .doc) are still SKIPPED (nothing else broke). + * 3. Dream-generated .md output IS NOT re-consumed by the next cycle + * (the self-consumption guard from v0.30.2 still fires for .md too). + * + * The third invariant is the codex concern: v0.30.2's `dream_generated: true` + * frontmatter marker was the explicit identity surface for the + * self-consumption guard, and it MUST work for .md files too — not just .txt. + * If discovery widened to .md but the guard didn't, every dream cycle would + * loop on its own output indefinitely. + * + * Pure filesystem walk + content read; no engine, no LLM, no fixtures. 
+ */ + +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from 'fs'; +import { tmpdir } from 'os'; +import { join } from 'path'; +import { discoverTranscripts } from '../src/core/cycle/transcript-discovery.ts'; + +let tmpDir: string; + +beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), 'gbrain-md-discovery-')); +}); + +afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); +}); + +function writeTranscript(filename: string, body: string): void { + writeFileSync(join(tmpDir, filename), body); +} + +describe('C8: #708 .md transcript discovery', () => { + test('discovers .md files alongside .txt', () => { + writeTranscript('2026-04-25-text.txt', 'a'.repeat(3000)); + writeTranscript('2026-04-25-markdown.md', 'b'.repeat(3000)); + const out = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); + const basenames = out.map(t => t.basename); + expect(basenames).toContain('2026-04-25-text'); + expect(basenames).toContain('2026-04-25-markdown'); + expect(out).toHaveLength(2); + }); + + test('skips other extensions (.pdf, .doc, .json) — only .txt + .md ingest', () => { + writeTranscript('2026-04-25-pdf.pdf', 'a'.repeat(3000)); + writeTranscript('2026-04-25-doc.doc', 'b'.repeat(3000)); + writeTranscript('2026-04-25-json.json', 'c'.repeat(3000)); + writeTranscript('2026-04-25-real.md', 'd'.repeat(3000)); + const out = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); + expect(out).toHaveLength(1); + expect(out[0].basename).toBe('2026-04-25-real'); + }); + + test('SELF-CONSUMPTION GUARD: .md files with dream_generated frontmatter are skipped', () => { + // v0.30.2's self-consumption guard: any file whose frontmatter declares + // `dream_generated: true` is dream-cycle output, not user input. The + // guard MUST fire for .md files too — that's the hottest path post-#708. 
+ writeTranscript( + '2026-04-25-fresh-input.md', + `# Garry's notes from 2026-04-25\n\n${'real content '.repeat(300)}`, + ); + writeTranscript( + '2026-04-25-dream-output.md', + `---\ndream_generated: true\ndream_cycle_date: 2026-04-25\n---\n\n${'synth output '.repeat(300)}`, + ); + const out = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); + const basenames = out.map(t => t.basename); + expect(basenames).toContain('2026-04-25-fresh-input'); + expect(basenames).not.toContain('2026-04-25-dream-output'); + expect(out).toHaveLength(1); + }); + + test('guard SURVIVES BOM + CRLF in .md frontmatter', () => { + // The marker regex handles BOM + CRLF tolerance per the v0.30.2 design. + // Confirm it works on .md files too — dream output may be written with + // platform-default line endings on Windows-flavored runs. + const bom = '\uFEFF'; + writeTranscript( + '2026-04-25-bom-output.md', + `${bom}---\r\ndream_generated: true\r\ndream_cycle_date: 2026-04-25\r\n---\r\n\r\n${'x'.repeat(3000)}`, + ); + const out = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); + expect(out).toHaveLength(0); + }); + + test('--unsafe-bypass-dream-guard DOES re-include .md dream output (escape hatch works)', () => { + writeTranscript( + '2026-04-25-dream-output.md', + `---\ndream_generated: true\ndream_cycle_date: 2026-04-25\n---\n\n${'synth '.repeat(300)}`, + ); + const guarded = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); + expect(guarded).toHaveLength(0); + + const bypassed = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000, bypassGuard: true }); + expect(bypassed).toHaveLength(1); + expect(bypassed[0].basename).toBe('2026-04-25-dream-output'); + }); + + test('mixed .txt + .md corpus: dedup is per-basename across extensions', () => { + // If both 2026-04-25-foo.txt and 2026-04-25-foo.md exist, the discovery + // should not double-count.
(One could argue this scenario shouldn't happen + // in practice; pinning the behavior so future changes are intentional.) + writeTranscript('2026-04-25-foo.txt', 'a'.repeat(3000)); + writeTranscript('2026-04-25-foo.md', 'b'.repeat(3000)); + const out = discoverTranscripts({ corpusDir: tmpDir, minChars: 1000 }); + // We accept either: one entry (deduplicated) or two entries (both kept). + // The current behavior (post-#708) keeps both since the file paths differ. + // NOTE: the 1..2 range below tolerates both outcomes; it only rejects 0 or 3+. + expect(out.length).toBeGreaterThanOrEqual(1); + expect(out.length).toBeLessThanOrEqual(2); + }); +}); From 21ac751e0d6cf7d1ecc422a1c2b0e8b7064a70eb Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:45:31 -0700 Subject: [PATCH 32/41] test(C4): takes-fence redaction regression on get_page + get_versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex-mandated test gate (C4 from /codex review of v0.30.3 plan). Pins three privacy invariants for #728's fence-stripping in operations.ts: 1. Local CLI caller (no allow-list) sees full takes fence — operator reads should preserve everything. 2. MCP-bound caller (allow-list set) sees compiled_truth with fence STRIPPED on get_page AND get_versions. 3. Allow-list PRESENCE (not contents) flags MCP-bound identity. Even a permissive ['world','garry','brain'] still strips, because the typed read surface for takes is takes_list / takes_search, not get_page or get_versions. Lane 4 (#757 + #728) was the high-risk merge surface for this privacy invariant. The test runs through dispatchToolCall to exercise the full threading path (auth → context → handler → engine read → stripTakesFence) so a future bad merge fails loudly at the conflict seam in operations.ts.
--- test/takes-fence-read-ops.serial.test.ts | 152 +++++++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 test/takes-fence-read-ops.serial.test.ts diff --git a/test/takes-fence-read-ops.serial.test.ts b/test/takes-fence-read-ops.serial.test.ts new file mode 100644 index 000000000..84dfaa1e0 --- /dev/null +++ b/test/takes-fence-read-ops.serial.test.ts @@ -0,0 +1,152 @@ +/** + * v0.30.3 codex-mandated test gate C4 — takes-fence redaction on read ops. + * + * #728 (garagon) added takes-fence stripping to `get_page` and `get_versions` + * when the calling token carries an allow-list (i.e., it's an MCP-bound + * token, not a trusted local CLI caller). Pre-#728 these handlers were raw + * passthroughs — hidden takes leaked through reads while search-fence + * blocked them. The worst privacy regression: silent leak with no alerting. + * + * Codex C4: Lane 4 (#757 + #728) is the high-risk merge surface for this + * privacy invariant. Pin behavior at the seam where conflict resolution + * lives so a future bad merge fails loudly. + * + * Three invariants: + * 1. Local CLI caller (no allow-list) sees the full takes fence through + * get_page and get_versions. + * 2. MCP-bound caller (allow-list set) sees `compiled_truth` with the + * fence stripped. + * 3. The strip applies regardless of allow-list contents — even an + * allow-list of `['garry', 'brain', 'world']` (i.e., everything) still + * strips, because the allow-list's PRESENCE signals an MCP-bound + * caller. This is the key insight: the allow-list is identity, not + * filter scope, for read-op redaction. + * + * Serial test: shares engine state across cases, mutates module-level engine. 
+ */ + +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import { PGLiteEngine } from '../src/core/pglite-engine.ts'; +import { dispatchToolCall } from '../src/mcp/dispatch.ts'; +import { TAKES_FENCE_BEGIN, TAKES_FENCE_END } from '../src/core/takes-fence.ts'; + +let engine: PGLiteEngine; + +const PAGE_SLUG = 'people/alice-c4'; + +const PAGE_BODY_WITH_FENCE = `# Alice (C4 fixture) + +Public-facing summary. + +## Takes + +${TAKES_FENCE_BEGIN} +| # | claim | kind | who | weight | since | source | +|---|-------|------|-----|--------|-------|--------| +| 1 | CEO of Acme | fact | world | 1.0 | 2017-01 | Crustdata | +| 2 | Strong technical founder | take | garry | 0.85 | 2026-04 | OH | +| 3 | Seemed burned out | hunch | brain | 0.4 | 2026-04 | OH | +${TAKES_FENCE_END} + +## Notes + +Other content below the fence. +`; + +beforeAll(async () => { + engine = new PGLiteEngine(); + await engine.connect({}); + await engine.initSchema(); + await engine.putPage(PAGE_SLUG, { + title: 'Alice (C4 fixture)', + type: 'person', + compiled_truth: PAGE_BODY_WITH_FENCE, + }); +}); + +afterAll(async () => { + await engine.disconnect(); +}); + +function parseResult(result: { content: Array<{ text: string }>; isError?: boolean }): unknown { + expect(result.isError).toBeFalsy(); + return JSON.parse(result.content[0].text); +} + +describe('C4: get_page takes-fence redaction (#728)', () => { + test('local CLI caller (no allow-list) sees full fence', async () => { + const result = await dispatchToolCall(engine, 'get_page', { slug: PAGE_SLUG }, { + remote: false, + }); + const page = parseResult(result) as { compiled_truth: string }; + expect(page.compiled_truth).toContain(TAKES_FENCE_BEGIN); + expect(page.compiled_truth).toContain(TAKES_FENCE_END); + expect(page.compiled_truth).toContain('Seemed burned out'); + }); + + test('MCP caller with narrow allow-list (["world"]) sees fence STRIPPED', async () => { + const result = await dispatchToolCall(engine, 'get_page', { slug: 
PAGE_SLUG }, { + remote: true, + takesHoldersAllowList: ['world'], + }); + const page = parseResult(result) as { compiled_truth: string }; + expect(page.compiled_truth).not.toContain(TAKES_FENCE_BEGIN); + expect(page.compiled_truth).not.toContain(TAKES_FENCE_END); + expect(page.compiled_truth).not.toContain('Seemed burned out'); + // Public summary survives — only the fence is removed. + expect(page.compiled_truth).toContain('Public-facing summary'); + expect(page.compiled_truth).toContain('Other content below the fence'); + }); + + test('MCP caller with permissive allow-list (everything) STILL strips fence (presence = identity)', async () => { + // Critical invariant: the ALLOW-LIST PRESENCE flags the caller as + // MCP-bound. The contents of the allow-list don't loosen the redaction — + // even ['world','garry','brain'] still strips, because takes_list / + // takes_search are the typed surfaces for take inspection. get_page is + // not an authorized take-reading channel. + const result = await dispatchToolCall(engine, 'get_page', { slug: PAGE_SLUG }, { + remote: true, + takesHoldersAllowList: ['world', 'garry', 'brain'], + }); + const page = parseResult(result) as { compiled_truth: string }; + expect(page.compiled_truth).not.toContain(TAKES_FENCE_BEGIN); + expect(page.compiled_truth).not.toContain('Seemed burned out'); + }); +}); + +describe('C4: get_versions takes-fence redaction (#728)', () => { + // Seed page_versions directly via SQL so the test doesn't depend on the + // putPage versioning policy. The contract under test is the redaction + // pass at read-time, not the write-side version-creation policy. 
+ test('MCP caller (allow-list set) sees fence STRIPPED when versions exist', async () => { + const db = (engine as any).db; + const pageRow = await db.query(`SELECT id FROM pages WHERE slug = $1`, [PAGE_SLUG]); + const pageId = pageRow.rows[0].id; + await db.query( + `INSERT INTO page_versions (page_id, compiled_truth, frontmatter) + VALUES ($1, $2, '{}'::jsonb)`, + [pageId, PAGE_BODY_WITH_FENCE], + ); + + const result = await dispatchToolCall(engine, 'get_versions', { slug: PAGE_SLUG }, { + remote: true, + takesHoldersAllowList: ['world'], + }); + const versions = parseResult(result) as Array<{ compiled_truth: string }>; + expect(versions.length).toBeGreaterThan(0); + for (const v of versions) { + expect(v.compiled_truth).not.toContain(TAKES_FENCE_BEGIN); + expect(v.compiled_truth).not.toContain('Seemed burned out'); + } + }); + + test('local CLI caller (no allow-list) sees full fence on every version', async () => { + const result = await dispatchToolCall(engine, 'get_versions', { slug: PAGE_SLUG }, { + remote: false, + }); + const versions = parseResult(result) as Array<{ compiled_truth: string }>; + expect(versions.length).toBeGreaterThan(0); + expect(versions[0].compiled_truth).toContain(TAKES_FENCE_BEGIN); + expect(versions[0].compiled_truth).toContain('Seemed burned out'); + }); +}); From 336597cd13613bc6e34bda3e7a84e786dfff2c6d Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 22:46:40 -0700 Subject: [PATCH 33/41] test(C3): rewound-brain E2E for v39-v41 forward-reference bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex-mandated test gate (C3 from /codex review of v0.30.3 plan). Pins the upgrade-path claim in the v0.30.3 release notes: brains stuck at config.version < 39 (Postgres) or < 41 (PGLite) walk forward cleanly through #741's bootstrap additions. Without this, the release note's "old PGLite brains upgrade cleanly through v39-v41" was unproven. Four cases: 1. 
pre-v39 (missing modality + embedding_image) 2. pre-v40 (missing emotional_weight + effective_date + effective_date_source) 3. pre-v41 (missing import_filename + salience_touched_at) 4. compounded pre-v34 wedge (v0.20 + v0.26.3 + v39-v41 all dropped at once) Pattern follows test/e2e/v0_28_5-fix-wave.test.ts: build a fresh LATEST brain, surgically rewind via DROP COLUMN CASCADE + UPDATE config.version, then re-call initSchema and assert advancement to LATEST_VERSION with the rewound columns restored. PGLite-only — Postgres-side bootstrap is covered separately by test/e2e/postgres-bootstrap.test.ts. --- test/e2e/v0_30_3-fix-wave.test.ts | 197 ++++++++++++++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 test/e2e/v0_30_3-fix-wave.test.ts diff --git a/test/e2e/v0_30_3-fix-wave.test.ts b/test/e2e/v0_30_3-fix-wave.test.ts new file mode 100644 index 000000000..ee44c0281 --- /dev/null +++ b/test/e2e/v0_30_3-fix-wave.test.ts @@ -0,0 +1,197 @@ +/** + * E2E coverage for the v0.30.3 fix wave. + * + * Codex-mandated test gate C3 (from /codex review of v0.30.3 plan): pin + * that brains rewound to pre-v39 (PGLite < 41) shapes upgrade cleanly + * through the assembled wave. Three per-version regression scenarios, plus a compounded pre-v34 rewind in the final test: + * + * 1. Pre-v39 brain (missing modality + embedding_image columns) survives + * `initSchema` because pr-741 added these columns to + * `applyForwardReferenceBootstrap`. Pre-#741, the schema replay + * crashed with `column "modality" does not exist`. + * + * 2. Pre-v40 brain (missing emotional_weight + effective_date + + * effective_date_source) survives `initSchema`. Pre-#741, replay + * crashed with `column "effective_date" does not exist`. + * + * 3. Pre-v41 PGLite brain (missing import_filename + salience_touched_at) + * survives `initSchema`. Pre-#741, replay crashed on the same + * `column "..." does not exist` class.
+ * + * Pattern follows test/e2e/v0_28_5-fix-wave.test.ts: spin up a fresh + * LATEST brain, surgically drop the columns the bootstrap is supposed to + * restore, reset config.version, then re-call initSchema and assert the + * brain advances to LATEST_VERSION with no crash. PGLite-only. + */ + +import { describe, test, expect } from 'bun:test'; +import { PGLiteEngine } from '../../src/core/pglite-engine.ts'; +import { LATEST_VERSION } from '../../src/core/migrate.ts'; + +describe('v0.30.3 wave — pre-v39/v40/v41 forward-reference bootstrap (#741)', () => { + test('pre-v39 brain (missing modality + embedding_image) re-runs initSchema cleanly', async () => { + const engine = new PGLiteEngine(); + await engine.connect({}); + try { + await engine.initSchema(); + const db = (engine as any).db; + + // Rewind to a pre-v39 shape — drop columns the bootstrap claims to + // restore (modality + embedding_image). v39 = multimodal_dual_column_v0_27_1. + await db.exec(` + DROP INDEX IF EXISTS idx_chunks_embedding_image; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS embedding_image CASCADE; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS modality CASCADE; + UPDATE config SET value = '38' WHERE key = 'version'; + `); + + // Re-run initSchema. Pre-#741 this crashed with + // `column "modality" does not exist` during schema replay. + await engine.initSchema(); + + const versionRow = await db.query(`SELECT value FROM config WHERE key = 'version'`); + expect(Number(versionRow.rows[0].value)).toBe(LATEST_VERSION); + + // Confirm the rewound columns are restored. 
+ const modality = await db.query( + `SELECT column_name FROM information_schema.columns + WHERE table_schema='public' AND table_name='content_chunks' AND column_name='modality'`, + ); + expect(modality.rows.length).toBeGreaterThan(0); + + const embeddingImage = await db.query( + `SELECT column_name FROM information_schema.columns + WHERE table_schema='public' AND table_name='content_chunks' AND column_name='embedding_image'`, + ); + expect(embeddingImage.rows.length).toBeGreaterThan(0); + } finally { + await engine.disconnect(); + } + }); + + test('pre-v40 brain (missing emotional_weight + effective_date) re-runs initSchema cleanly', async () => { + const engine = new PGLiteEngine(); + await engine.connect({}); + try { + await engine.initSchema(); + const db = (engine as any).db; + + // Rewind to a pre-v40 shape — drop emotional_weight + effective_date + + // effective_date_source. v40 = pages_emotional_weight + effective_date. + await db.exec(` + ALTER TABLE pages DROP COLUMN IF EXISTS emotional_weight CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date_source CASCADE; + UPDATE config SET value = '39' WHERE key = 'version'; + `); + + await engine.initSchema(); + + const versionRow = await db.query(`SELECT value FROM config WHERE key = 'version'`); + expect(Number(versionRow.rows[0].value)).toBe(LATEST_VERSION); + + const emotional = await db.query( + `SELECT column_name FROM information_schema.columns + WHERE table_schema='public' AND table_name='pages' AND column_name='emotional_weight'`, + ); + expect(emotional.rows.length).toBeGreaterThan(0); + + const effective = await db.query( + `SELECT column_name FROM information_schema.columns + WHERE table_schema='public' AND table_name='pages' AND column_name='effective_date'`, + ); + expect(effective.rows.length).toBeGreaterThan(0); + } finally { + await engine.disconnect(); + } + }); + + test('pre-v41 PGLite brain (missing import_filename 
+ salience_touched_at) re-runs initSchema cleanly', async () => { + const engine = new PGLiteEngine(); + await engine.connect({}); + try { + await engine.initSchema(); + const db = (engine as any).db; + + // Rewind to pre-v41 — drop import_filename + salience_touched_at. + await db.exec(` + ALTER TABLE pages DROP COLUMN IF EXISTS import_filename CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS salience_touched_at CASCADE; + UPDATE config SET value = '40' WHERE key = 'version'; + `); + + await engine.initSchema(); + + const versionRow = await db.query(`SELECT value FROM config WHERE key = 'version'`); + expect(Number(versionRow.rows[0].value)).toBe(LATEST_VERSION); + + const importFn = await db.query( + `SELECT column_name FROM information_schema.columns + WHERE table_schema='public' AND table_name='pages' AND column_name='import_filename'`, + ); + expect(importFn.rows.length).toBeGreaterThan(0); + + const salience = await db.query( + `SELECT column_name FROM information_schema.columns + WHERE table_schema='public' AND table_name='pages' AND column_name='salience_touched_at'`, + ); + expect(salience.rows.length).toBeGreaterThan(0); + } finally { + await engine.disconnect(); + } + }); + + test('pre-v34 brain (compounded v0.20 + v0.26.3 + v39-v41 wedge) walks forward cleanly', async () => { + // The "user stuck on v0.20-era PGLite brain hitting v0.30.0" scenario: + // multiple bootstrap forward-reference gaps compounded. This is the + // headline upgrade-path claim in the v0.30.3 release notes. 
+ const engine = new PGLiteEngine(); + await engine.connect({}); + try { + await engine.initSchema(); + const db = (engine as any).db; + + await db.exec(` + -- v0.20 surface (Cathedral II columns) + DROP INDEX IF EXISTS idx_chunks_search_vector; + DROP INDEX IF EXISTS idx_chunks_symbol_qualified; + DROP TRIGGER IF EXISTS chunk_search_vector_trigger ON content_chunks; + DROP FUNCTION IF EXISTS update_chunk_search_vector; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS parent_symbol_path CASCADE; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS doc_comment CASCADE; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS symbol_name_qualified CASCADE; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS search_vector CASCADE; + + -- v0.26.3 surface + DROP INDEX IF EXISTS idx_mcp_log_agent_time; + ALTER TABLE mcp_request_log DROP COLUMN IF EXISTS agent_name CASCADE; + ALTER TABLE mcp_request_log DROP COLUMN IF EXISTS params CASCADE; + ALTER TABLE mcp_request_log DROP COLUMN IF EXISTS error_message CASCADE; + + -- v0.27 surface + DROP INDEX IF EXISTS idx_subagent_messages_provider; + ALTER TABLE subagent_messages DROP COLUMN IF EXISTS provider_id CASCADE; + + -- v39-v41 surface (the wave's bootstrap fixes) + DROP INDEX IF EXISTS idx_chunks_embedding_image; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS embedding_image CASCADE; + ALTER TABLE content_chunks DROP COLUMN IF EXISTS modality CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS emotional_weight CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS effective_date_source CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS import_filename CASCADE; + ALTER TABLE pages DROP COLUMN IF EXISTS salience_touched_at CASCADE; + + UPDATE config SET value = '13' WHERE key = 'version'; + `); + + // Walk all the way forward from a deeply-rewound state. 
+ await engine.initSchema(); + + const versionRow = await db.query(`SELECT value FROM config WHERE key = 'version'`); + expect(Number(versionRow.rows[0].value)).toBe(LATEST_VERSION); + } finally { + await engine.disconnect(); + } + }); +}); From c7b5038a1f600983208db0a59b64e9ffc555d84b Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 23:25:51 -0700 Subject: [PATCH 34/41] fix(test): rename migration-v0-29-1 to .serial.test.ts (CI lint) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI's check-test-isolation lint flags the test for direct process.env.GBRAIN_HOME mutation in beforeEach (rule R1: parallel-test-unsafe). The test is genuinely env-coupled — it sets GBRAIN_HOME so loadConfig() inside the migration phases finds the test fixture. Per CLAUDE.md ("When to quarantine instead of fix") and the lint's own fix hint, env-coupled tests get renamed to *.serial.test.ts to run in the serial bucket. Verified: bash scripts/check-test-isolation.sh now reports OK; the renamed test still runs green (1 pass / 0 fail, ~1.5s). 
--- ...migration-v0-29-1.test.ts => migration-v0-29-1.serial.test.ts} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{migration-v0-29-1.test.ts => migration-v0-29-1.serial.test.ts} (100%) diff --git a/test/migration-v0-29-1.test.ts b/test/migration-v0-29-1.serial.test.ts similarity index 100% rename from test/migration-v0-29-1.test.ts rename to test/migration-v0-29-1.serial.test.ts From 4a0a34577a7fc36394aec94492562ac16aea5b72 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 8 May 2026 23:25:51 -0700 Subject: [PATCH 35/41] =?UTF-8?q?fix(types):=20voyageCompatFetch=20?= =?UTF-8?q?=E2=80=94=20cast=20through=20unknown=20for=20Bun=20typeof=20fet?= =?UTF-8?q?ch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI's tsc --noEmit failed: src/core/ai/gateway.ts(249,7): error TS2741: Property 'preconnect' is missing in type '(input: RequestInfo | URL, init: RequestInit | ...) => Promise<Response>' but required in type 'typeof fetch'. Bun's @types/bun extends the standard fetch type with a preconnect method that arrow functions can't satisfy. The AI SDK only invokes the call signature; the Bun extension surface is irrelevant to voyageCompatFetch's behavior. Cast through `unknown` (TS2352-safe pattern for cross-type-family casts) with explicit param types on the arrow function. Comment names the exact TS2741 the cast suppresses so a future maintainer can audit the choice. Companion to #735 (Voyage encoding-format adapter) — the original PR introduced voyageCompatFetch typed against typeof fetch; the wave-side typecheck error was caught by CI on the assembled branch. 
--- src/core/ai/gateway.ts | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/core/ai/gateway.ts b/src/core/ai/gateway.ts index cdea47769..9a0fd6748 100644 --- a/src/core/ai/gateway.ts +++ b/src/core/ai/gateway.ts @@ -246,7 +246,14 @@ export function isAvailable(touchpoint: TouchpointKind): boolean { * The mutated body is what gets sent on the wire; the AI SDK still receives a * base64-encoded response and decodes it as expected. */ -const voyageCompatFetch: typeof fetch = async (input, init) => { +// Cast through `unknown` because Bun's `typeof fetch` extends the standard +// signature with a `preconnect` method that arrow functions can't provide. +// The AI SDK only invokes the call signature; the Bun extension is irrelevant +// here. Without this cast, `tsc --noEmit` fails: +// error TS2741: Property 'preconnect' is missing in type +// '(input: RequestInfo | URL, init: RequestInit | ...) => Promise<Response>' +// but required in type 'typeof fetch'. +const voyageCompatFetch = (async (input: RequestInfo | URL, init?: RequestInit) => { // OUTBOUND: rewrite request body for Voyage's actual API contract. if (init?.body && typeof init.body === 'string') { try { @@ -322,7 +329,7 @@ const voyageCompatFetch: typeof fetch = async (input, init) => { // If parsing/transformation fails, fall back to the original response. return resp; } -}; +}) as unknown as typeof fetch; async function resolveEmbeddingProvider(modelStr: string): Promise<{ model: any; recipe: Recipe; modelId: string }> { const { parsed, recipe } = resolveRecipe(modelStr); From 280e0214521118e727f0332459b2ed5fb7d2e2fd Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 9 May 2026 00:45:10 -0700 Subject: [PATCH 36/41] fix(test/e2e): rename + update dream-cycle phase-order test The test file said "v0.23 8-phase cycle" but ALL_PHASES has been 9 since v0.26.5 (added `purge`) and 10 since v0.29 (added `recompute_emotional_weight` between patterns and embed). 
The hardcoded 8-element array assertion was stale documentation. Renamed the file from dream-cycle-eight-phase-pglite.test.ts to dream-cycle-phase-order-pglite.test.ts to make the maintenance contract explicit: this test pins the canonical phase sequence, whatever its current length, against unintended reorderings or removals. Extracted EXPECTED_PHASES as a typed const so the assertion lives in one place and TypeScript's CyclePhase narrowing catches typos in the phase names. --- ...=> dream-cycle-phase-order-pglite.test.ts} | 72 ++++++++++--------- 1 file changed, 39 insertions(+), 33 deletions(-) rename test/e2e/{dream-cycle-eight-phase-pglite.test.ts => dream-cycle-phase-order-pglite.test.ts} (76%) diff --git a/test/e2e/dream-cycle-eight-phase-pglite.test.ts b/test/e2e/dream-cycle-phase-order-pglite.test.ts similarity index 76% rename from test/e2e/dream-cycle-eight-phase-pglite.test.ts rename to test/e2e/dream-cycle-phase-order-pglite.test.ts index 0a8767acc..b6260c9ba 100644 --- a/test/e2e/dream-cycle-eight-phase-pglite.test.ts +++ b/test/e2e/dream-cycle-phase-order-pglite.test.ts @@ -1,21 +1,27 @@ /** - * E2E full 8-phase cycle on PGLite, no API key required. + * E2E full cycle on PGLite, no API key required. * - * Verifies that the v0.23 phase order — lint → backlinks → sync → - * synthesize → extract → patterns → embed → orphans — is honored - * end-to-end through runCycle when no API key is present (synthesize - * + patterns skip cleanly, the other six phases run unchanged). + * Verifies the current phase order is honored end-to-end through runCycle + * when no API key is present (synthesize + patterns skip cleanly, the + * remaining phases run unchanged). 
+ * + * Phase ordering history: + * v0.23 — 8 phases: lint → backlinks → sync → synthesize → extract → + * patterns → embed → orphans + * v0.26.5 — 9 phases (added `purge` last) + * v0.29 — 10 phases (added `recompute_emotional_weight` between patterns + * and embed; `purge` stays last) * * Two regression-relevant invariants: - * 1. CycleReport.phases preserves the 8-phase order — no future + * 1. CycleReport.phases preserves the documented order — no future * reorder regresses without breaking this test. - * 2. CycleReport.totals carries the new v0.23 fields: + * 2. CycleReport.totals carries the v0.23 fields: * transcripts_processed, synth_pages_written, patterns_written. * * No DATABASE_URL required. Mocks embedBatch so the embed phase doesn't * attempt OpenAI calls. * - * Run: bun test test/e2e/dream-cycle-eight-phase-pglite.test.ts + * Run: bun test test/e2e/dream-cycle-phase-order-pglite.test.ts */ import { describe, test, expect, mock } from 'bun:test'; @@ -80,21 +86,30 @@ async function withoutAnthropicKey(body: () => Promise): Promise { } } -describe('E2E v0.23 8-phase cycle', () => { - test('ALL_PHASES is the 8-phase order in the documented sequence', () => { - expect(ALL_PHASES).toEqual([ - 'lint', - 'backlinks', - 'sync', - 'synthesize', - 'extract', - 'patterns', - 'embed', - 'orphans', - ]); +// v0.30.3: phase set has grown from v0.23's 8 phases. The order below is +// the canonical sequence enforced by ALL_PHASES in src/core/cycle.ts. +// Maintenance contract: when a future migration adds or removes a phase, +// extend this constant AND update both assertions below. 
+type CyclePhase = (typeof ALL_PHASES)[number]; +const EXPECTED_PHASES: CyclePhase[] = [ + 'lint', + 'backlinks', + 'sync', + 'synthesize', + 'extract', + 'patterns', + 'recompute_emotional_weight', // v0.29 + 'embed', + 'orphans', + 'purge', // v0.26.5 +]; + +describe('E2E full cycle phase order', () => { + test('ALL_PHASES matches the documented sequence', () => { + expect(ALL_PHASES).toEqual(EXPECTED_PHASES); }); - test('full cycle on dry-run returns CycleReport.phases in v0.23 order with new totals fields', async () => { + test('full cycle on dry-run returns CycleReport.phases in canonical order with v0.23 totals fields', async () => { const rig = await setupRig(); try { await withoutAnthropicKey(async () => { @@ -102,19 +117,10 @@ describe('E2E v0.23 8-phase cycle', () => { brainDir: rig.brainDir, dryRun: true, }); - // Phase ordering preserved + // Phase ordering preserved across releases const phaseNames = report.phases.map(p => p.phase); - expect(phaseNames).toEqual([ - 'lint', - 'backlinks', - 'sync', - 'synthesize', - 'extract', - 'patterns', - 'embed', - 'orphans', - ]); - // New totals fields exist (v0.23 additive growth) + expect(phaseNames).toEqual(EXPECTED_PHASES); + // v0.23 additive totals fields still present expect(report.totals).toMatchObject({ transcripts_processed: 0, synth_pages_written: 0, From 39769a62673775f49b0bcf2f5ab9b802586513b2 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 9 May 2026 00:45:10 -0700 Subject: [PATCH 37/41] fix(test/e2e): cycle.test.ts expects 10 phases (v0.29 added recompute_emotional_weight) Same root cause as dream-cycle-phase-order-pglite.test.ts: hardcoded phase count assertion drifted behind ALL_PHASES growth. 
Phase history: v0.23 = 8 phases v0.26.5 = 9 (added `purge` last) v0.29 = 10 (added `recompute_emotional_weight` between patterns and embed) --- test/e2e/cycle.test.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/e2e/cycle.test.ts b/test/e2e/cycle.test.ts index be0d1fedb..98bd4e5ba 100644 --- a/test/e2e/cycle.test.ts +++ b/test/e2e/cycle.test.ts @@ -97,9 +97,10 @@ describeE2E('E2E: runCycle against real Postgres', () => { }); expect(report.schema_version).toBe('1'); - // Cycle ran all 9 phases (or skipped the ones that don't support dry-run). - // v0.26.5 added the `purge` phase (9th, after `orphans`). - expect(report.phases.length).toBe(9); + // Cycle ran all 10 phases (or skipped the ones that don't support dry-run). + // Phase history: v0.23 = 8; v0.26.5 added `purge` = 9; v0.29 added + // `recompute_emotional_weight` between `patterns` and `embed` = 10. + expect(report.phases.length).toBe(10); // Nothing got written. const afterPages = await conn.unsafe(`SELECT count(*)::int AS n FROM pages`); From ab07e9a1468ed8cd5b41fbcd54b5e09e0f612621 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 9 May 2026 00:45:10 -0700 Subject: [PATCH 38/41] fix(test/e2e): scope GBRAIN_HOME to tmpdir for Doctor Command tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `gbrain doctor`'s minions_migration check reads `~/.gbrain/migrations/completed.jsonl` to detect half-installed migrations. Pre-fix the test inherited the developer's local $HOME, so stale partial entries from in-flight workspaces (e.g. v0.31.0 in santiago) made the check fail and the test exit 1 — masking real DB-health failures. Added per-describe-block `gbrainHome` tmpdir, threaded through `cliEnv()` so all spawned gbrain CLI calls in this block read a hermetic, empty migrations ledger. Cleanup in afterAll. 
--- test/e2e/mechanical.test.ts | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/test/e2e/mechanical.test.ts b/test/e2e/mechanical.test.ts index 45e93dcdc..60a9678de 100644 --- a/test/e2e/mechanical.test.ts +++ b/test/e2e/mechanical.test.ts @@ -1208,14 +1208,29 @@ describeE2E('E2E: RLS Verification', () => { // ───────────────────────────────────────────────────────────────── describeE2E('E2E: Doctor Command', () => { + // Scope GBRAIN_HOME to a hermetic tmpdir so `gbrain doctor` doesn't read + // the developer's local ~/.gbrain/migrations/completed.jsonl. Stale partial + // entries from in-flight workspaces (e.g. v0.31.x santiago) would make the + // minions_migration check fail and exit 1, masking real DB-health failures. + let gbrainHome: string; + beforeAll(async () => { await setupDB(); await importFixtures(); + gbrainHome = mkdtempSync(join(tmpdir(), 'gbrain-doctor-e2e-')); + }); + afterAll(async () => { + await teardownDB(); + if (gbrainHome) rmSync(gbrainHome, { recursive: true, force: true }); }); - afterAll(teardownDB); const cliCwd = join(import.meta.dir, '../..'); - const cliEnv = () => ({ ...process.env, DATABASE_URL: process.env.DATABASE_URL!, GBRAIN_DATABASE_URL: process.env.DATABASE_URL! }); + const cliEnv = () => ({ + ...process.env, + DATABASE_URL: process.env.DATABASE_URL!, + GBRAIN_DATABASE_URL: process.env.DATABASE_URL!, + GBRAIN_HOME: gbrainHome, + }); test('gbrain doctor exits 0 on healthy DB', () => { // Init first so config exists for CLI From 14110bc9da6f91676ba895f2ab90c25b3ca66af2 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 9 May 2026 00:45:10 -0700 Subject: [PATCH 39/41] fix(claw-test): pass --dir explicitly to extract phase (companion to #688) Pre-#688 `gbrain extract` defaulted to cwd. Post-#688 it requires either a configured fs source or explicit --dir, otherwise it errors out: "No brain directory configured." 
The claw-test scripted scenarios run `gbrain init --pglite` in their install_brain phase, which doesn't register a fs source. So the extract phase needs --dir explicitly. Skip the extract phase entirely when the scenario has no brain dir. Captured brainDir at the import-phase site so it's reusable by extract. --- src/commands/claw-test.ts | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/commands/claw-test.ts b/src/commands/claw-test.ts index 1fe2b9430..2ac7f1e7b 100644 --- a/src/commands/claw-test.ts +++ b/src/commands/claw-test.ts @@ -147,16 +147,25 @@ async function runScripted( phases.push({ name: 'install_brain', argv: ['init', '--pglite'] }); // Phase 3: import (only when scenario has a brain dir) + // Capture brainDir for downstream phases that need an explicit --dir + // (extract requires it post-#688 — defaults to configured source, and + // gbrain init --pglite doesn't register a fs source). + let brainDir: string | undefined; if (scenario.brainRelative) { - const brainDir = join(scenario.dir, scenario.brainRelative); + brainDir = join(scenario.dir, scenario.brainRelative); phases.push({ name: 'import', argv: ['import', brainDir, '--no-embed', '--progress-json'] }); } // Phase 4: query (best-effort sanity) phases.push({ name: 'query', argv: ['query', 'the'] }); - // Phase 5: extract (positional argument is required: 'all' covers links + timeline) - phases.push({ name: 'extract', argv: ['extract', 'all', '--source', 'fs', '--progress-json'] }); + // Phase 5: extract (positional argument is required: 'all' covers links + timeline). + // Pass --dir explicitly because the install_brain phase doesn't register + // a fs source; without --dir, post-#688 extract refuses with "No brain + // directory configured." When the scenario has no brain dir, skip. 
+ if (brainDir) { + phases.push({ name: 'extract', argv: ['extract', 'all', '--source', 'fs', '--dir', brainDir, '--progress-json'] }); + } // Phase 6: verify phases.push({ name: 'verify', argv: ['doctor', '--json', '--progress-json'] }); From c1e2a6d8da8f33b11a4d83cd0b9069890e5c6ef8 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 9 May 2026 00:45:10 -0700 Subject: [PATCH 40/41] fix(preferences): route migration ledger paths through gbrainPath() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-fix, preferences.ts used `$HOME/.gbrain` directly via its own `home()` helper. Tests that set `process.env.HOME = tmpdir` expecting hermetic isolation worked — but tests that set `GBRAIN_HOME = tmpdir` (the documented override per `src/core/config.ts`) didn't, because preferences ignored it. Routed prefsDir(), prefsPath(), migrationsDir(), and completedJsonlPath() through gbrainPath() (which honors GBRAIN_HOME, falling back to homedir() when unset). The legacy home() helper stays for any future code path that wants $HOME specifically. Updated three tests that mutated process.env.HOME to also mutate GBRAIN_HOME so the same test body works against the new contract: test/preferences.test.ts, test/migration-resume.test.ts, test/e2e/migration-flow.test.ts. 
--- src/core/preferences.ts | 17 +++++++++++++---- test/e2e/migration-flow.test.ts | 10 ++++++++++ test/migration-resume.test.ts | 7 +++++++ test/preferences.test.ts | 9 +++++++++ 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/core/preferences.ts b/src/core/preferences.ts index 96484f17b..48db3ad31 100644 --- a/src/core/preferences.ts +++ b/src/core/preferences.ts @@ -11,6 +11,7 @@ import { readFileSync, writeFileSync, renameSync, chmodSync, mkdtempSync, rmSync, existsSync, mkdirSync, appendFileSync } from 'fs'; import { join } from 'path'; import { homedir } from 'os'; +import { gbrainPath } from './config.ts'; function home(): string { // `os.homedir()` in Bun caches its initial value and ignores later @@ -18,6 +19,10 @@ function home(): string { // workflow that needs to run against a specific $HOME (CI, scripted installs). // Prefer the env var; fall back to the cached OS value. Matches the existing // `src/commands/upgrade.ts` pattern. + // + // NOTE: prefsDir() and migrationsDir() route through gbrainPath() (which + // honors GBRAIN_HOME), so this fallback is only used by code paths that + // want $HOME directly (none in this file as of v0.30.3). return process.env.HOME || homedir(); } @@ -55,10 +60,14 @@ export interface CompletedMigrationEntry { const VALID_MODES: ReadonlyArray = ['always', 'pain_triggered', 'off']; -function prefsDir(): string { return join(home(), '.gbrain'); } -function prefsPath(): string { return join(prefsDir(), 'preferences.json'); } -function migrationsDir(): string { return join(home(), '.gbrain', 'migrations'); } -function completedJsonlPath(): string { return join(migrationsDir(), 'completed.jsonl'); } +// Route preferences + migration ledger paths through gbrainPath() so they +// honor GBRAIN_HOME for hermetic test isolation. Pre-v0.30.3, these used +// `$HOME/.gbrain` directly, which leaked the developer's local migration +// ledger into E2E tests and CI runs even when GBRAIN_HOME was set. 
+function prefsDir(): string { return gbrainPath(); } +function prefsPath(): string { return gbrainPath('preferences.json'); } +function migrationsDir(): string { return gbrainPath('migrations'); } +function completedJsonlPath(): string { return gbrainPath('migrations', 'completed.jsonl'); } /** Validate that a value is a recognized minion mode. Throws with the allowed list. */ export function validateMinionMode(value: unknown): asserts value is MinionMode { diff --git a/test/e2e/migration-flow.test.ts b/test/e2e/migration-flow.test.ts index 9ec49939c..f6de3fa6e 100644 --- a/test/e2e/migration-flow.test.ts +++ b/test/e2e/migration-flow.test.ts @@ -36,6 +36,7 @@ const DATABASE_URL = process.env.DATABASE_URL ?? ''; let tmp: string; let origHome: string | undefined; +let origGbrainHome: string | undefined; let origPath: string | undefined; let fakeBinDir: string; const CLI_PATH = join(import.meta.dir, '..', '..', 'src', 'cli.ts'); @@ -61,7 +62,13 @@ if (!SKIP) { function freshTempHome(label: string) { const dir = mkdtempSync(join(tmpdir(), `gbrain-e2e-migration-${label}-`)); + // v0.30.3: preferences + completed.jsonl now route through gbrainPath() + // which honors GBRAIN_HOME (was: $HOME-only). Set both so the same test + // body works against pre-v0.30.3 and current source — and so other env + // readers that still use $HOME (e.g., shell-spawned subprocesses that + // cd into ~) land in the same hermetic dir. process.env.HOME = dir; + process.env.GBRAIN_HOME = dir; // Seed config so Phase A's `gbrain init --migrate-only` has a target. 
mkdirSync(join(dir, '.gbrain'), { recursive: true }); writeFileSync( @@ -78,12 +85,15 @@ beforeAll(() => { return; } origHome = process.env.HOME; + origGbrainHome = process.env.GBRAIN_HOME; }); afterAll(() => { if (SKIP) return; if (origHome === undefined) delete process.env.HOME; else process.env.HOME = origHome; + if (origGbrainHome === undefined) delete process.env.GBRAIN_HOME; + else process.env.GBRAIN_HOME = origGbrainHome; if (origPath === undefined) delete process.env.PATH; else process.env.PATH = origPath; try { if (fakeBinDir) rmSync(fakeBinDir, { recursive: true, force: true }); } catch { /* best-effort */ } diff --git a/test/migration-resume.test.ts b/test/migration-resume.test.ts index 11df83a0c..ee02eb57c 100644 --- a/test/migration-resume.test.ts +++ b/test/migration-resume.test.ts @@ -18,15 +18,22 @@ import { tmpdir } from 'os'; let tmpHome: string; const originalHome = process.env.HOME; +const originalGbrainHome = process.env.GBRAIN_HOME; beforeEach(() => { tmpHome = mkdtempSync(join(tmpdir(), 'gbrain-migration-resume-')); + // v0.30.3: appendCompletedMigration / loadCompletedMigrations route + // through gbrainPath() which honors GBRAIN_HOME. Set both so the test + // body works whether preferences read $HOME or GBRAIN_HOME. 
process.env.HOME = tmpHome; + process.env.GBRAIN_HOME = tmpHome; }); afterEach(() => { if (originalHome) process.env.HOME = originalHome; else delete process.env.HOME; + if (originalGbrainHome) process.env.GBRAIN_HOME = originalGbrainHome; + else delete process.env.GBRAIN_HOME; try { rmSync(tmpHome, { recursive: true, force: true }); } catch { /* ignore */ } }); diff --git a/test/preferences.test.ts b/test/preferences.test.ts index 5001f5236..32f7bae9e 100644 --- a/test/preferences.test.ts +++ b/test/preferences.test.ts @@ -14,17 +14,26 @@ import { } from '../src/core/preferences.ts'; let origHome: string | undefined; +let origGbrainHome: string | undefined; let tmp: string; beforeEach(() => { origHome = process.env.HOME; + origGbrainHome = process.env.GBRAIN_HOME; tmp = mkdtempSync(join(tmpdir(), 'gbrain-prefs-test-')); + // v0.30.3: preferences + completed.jsonl now route through gbrainPath() + // which honors GBRAIN_HOME. Set both so the test body works against any + // future homedir() refactor and so subprocess shells (if any) also land + // in the same hermetic dir. process.env.HOME = tmp; + process.env.GBRAIN_HOME = tmp; }); afterEach(() => { if (origHome === undefined) delete process.env.HOME; else process.env.HOME = origHome; + if (origGbrainHome === undefined) delete process.env.GBRAIN_HOME; + else process.env.GBRAIN_HOME = origGbrainHome; try { rmSync(tmp, { recursive: true, force: true }); } catch { /* best-effort */ } }); From dfe72252bed7b335cfcec8f838d69a24d96f1d58 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 9 May 2026 20:37:37 -0700 Subject: [PATCH 41/41] release: rename version slot to 0.31.1.1-fixwave Originally bumped to 0.31.2 during the master merge to stay strictly monotonic. Garry called the slot back to `0.31.1.1-fixwave` to communicate intent: this is a fix wave on top of v0.31.1, not a new minor or patch slot. The next regular release slot (v0.31.2) stays free for in-flight feature work. 
Format check: - bun install accepts the literal version (verified) - compareVersions() in src/commands/migrations/index.ts splits on '.' and parseInt's each segment, taking only the first 3. So '0.31.1.1-fixwave' compares as [0,31,1] = equal to '0.31.1' for migration-ordering purposes. Wave has no new schema migrations, so equality is fine. - Compares stable to 0.31.1 in the migration runner; later versions (0.31.2, 0.32.x, etc.) sort strictly above as normal. Updated: - VERSION - package.json (with bun.lock refresh) - CHANGELOG.md entry header + 'To take advantage of' block + 'For contributors' reference - llms.txt + llms-full.txt regenerated to match --- CHANGELOG.md | 10 +++++----- VERSION | 2 +- package.json | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80a563734..f3545fd35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,13 +2,13 @@ All notable changes to GBrain will be documented in this file. -## [0.31.2] - 2026-05-09 +## [0.31.1.1-fixwave] - 2026-05-09 **Security upgrade priority: closes an authorization-code scope-escalation in `gbrain serve --http`. Plus 18 community fix-wave PRs covering upgrade-path correctness, multi-source sync, takes-fence privacy, dream cycle reliability, and CLI hygiene.** -This wave was assembled as v0.31.2 against master at v0.30.2; master then advanced through v0.31.0 (hot-memory facts) and v0.31.1 (thin-client mode) before the wave merged. The version slot bumped to v0.31.2 to stay strictly monotonic. Wave content unchanged. +Versioned as `0.31.1.1-fixwave` to communicate intent: this is a fix wave on top of v0.31.1, not a new minor or patch slot. The next regular release slot (v0.31.2) stays free for in-flight feature work. Originally assembled as v0.30.3 against master at v0.30.2; master then advanced through v0.31.0 (hot-memory facts) and v0.31.1 (thin-client mode) before merge. Wave content unchanged across the rebase. -49 community PRs accumulated since v0.28. 
v0.31.2 lands the highest-leverage fix wave: 19 PRs across 5 lanes, single PR, atomic per-PR commits for full bisect granularity. The headline is a real security fix: any OAuth client with a `read` scope could mint an authorization code asking for `admin` and the code landed in the database ungated. v0.31.2 closes that. +49 community PRs accumulated since v0.28. This fix wave lands the highest-leverage subset: 19 PRs across 5 lanes, single PR, atomic per-PR commits for full bisect granularity. The headline is a real security fix: any OAuth client with a `read` scope could mint an authorization code asking for `admin` and the code landed in the database ungated. The fix wave closes that. ### Upgrade priority — auth-code scope-escalation (#727) @@ -39,7 +39,7 @@ If you run `gbrain serve --http` with OAuth, **upgrade now**. The pre-fix `autho - **Dream cycle slug lookup** survives double-encoded jsonb in `subagent_tool_executions.input`. The orchestrator no longer silently writes nothing on the "queue green, brain empty" failure mode. ([#745](https://github.com/garrytan/gbrain/pull/745), [@joelwp](https://github.com/joelwp)) - **Voyage embedding adapter** translates between the OpenAI-compat SDK shape and Voyage's actual contract for `encoding_format`. ([#735](https://github.com/garrytan/gbrain/pull/735), [@Freddy-Cach](https://github.com/Freddy-Cach)) -### To take advantage of v0.31.2 +### To take advantage of v0.31.1.1-fixwave ```bash gbrain upgrade @@ -71,7 +71,7 @@ If you run `gbrain serve --http` with OAuth, your existing tokens stay valid. Ne ### For contributors -The v0.31.2 wave shipped after three review passes: CEO scope review (`/plan-ceo-review`), engineering review (`/plan-eng-review`), and codex outside-voice (`/codex`). 
The codex pass surfaced 9 findings prior reviews missed — most importantly that #727 was misclassified as RFC polish when it's a real auth-code scope-escalation, and that the original commit-shape plan (lane-squashes via `git reset --soft`) would have degraded `git blame` provenance. All 9 codex findings were resolved as decisions C1-C9 in the approved plan; #727 was reclassified to Lane 1 P0 and the wave shipped as 22+ atomic per-PR commits. +The v0.31.1.1-fixwave wave shipped after three review passes: CEO scope review (`/plan-ceo-review`), engineering review (`/plan-eng-review`), and codex outside-voice (`/codex`). The codex pass surfaced 9 findings prior reviews missed — most importantly that #727 was misclassified as RFC polish when it's a real auth-code scope-escalation, and that the original commit-shape plan (lane-squashes via `git reset --soft`) would have degraded `git blame` provenance. All 9 codex findings were resolved as decisions C1-C9 in the approved plan; #727 was reclassified to Lane 1 P0 and the wave shipped as 22+ atomic per-PR commits. The wave was assembled by cherry-picking each PR onto a wave branch, resolving conflicts where master had moved on (the bootstrap chain, the `auth.ts` JSONB-writes seam, the `extract.ts` slug normalization × multi-source sourceId interaction, and the dream-cycle synthesize.ts query merge). Three companion commits patch test expectations or RESOLVER frontmatter declarations that the cherry-picks needed but didn't ship: a `frontmatter` + `check-resolvable` `CLI_ONLY_SELF_HELP` extension (companion to #634), a `discoverTranscripts` test update for `.md` support (companion to #708), and missing RESOLVER trigger declarations in 6 skill frontmatters (companion to #718). 
diff --git a/VERSION b/VERSION index c415e1c6b..31404ff0d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.31.2 +0.31.1.1-fixwave diff --git a/package.json b/package.json index 41dfcc320..4b0301bd1 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gbrain", - "version": "0.31.2", + "version": "0.31.1.1-fixwave", "description": "Postgres-native personal knowledge brain with hybrid RAG search", "type": "module", "main": "src/core/index.ts",