From 4949aab68b71f540514b3a7aad45828fa6366f5a Mon Sep 17 00:00:00 2001
From: eldar702 <eldarshlomi7@gmail.com>
Date: Sun, 19 Apr 2026 11:13:50 +0300
Subject: [PATCH 001/127] fix: guard None metadata/doc in tool_check_duplicate
 and Layer1/Layer2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents``
lists of a query/get result for partially-flushed rows. The codebase
already has a systemic None-guard pattern (merged #999, #1013, #1019)
but three call sites were still unguarded:

* ``mcp_server.tool_check_duplicate`` (``mcp_server.py:487-488``) —
  ``meta = results["metadatas"][0][i]`` followed by ``meta.get(...)``
  raises ``AttributeError: 'NoneType' object has no attribute 'get'``.
  The broad ``except Exception`` wrapper (line 504) swallows it and
  returns an uninformative ``"Duplicate check failed"``.

* ``layers.Layer1.generate`` (``layers.py:126``) — iterates
  ``zip(docs, metas)`` and calls ``meta.get(key)`` in the importance
  loop. A single None metadata blows up the entire wake-up render.

* ``layers.Layer2.retrieve`` (``layers.py:224``) — same pattern, same
  crash path for the on-demand render.

Apply the same ``meta = meta or {}`` / ``doc = doc or ""`` idiom used
by the merged guards in the search path. Three-line additions, no
behaviour change on well-formed results.

Tests added:

* ``test_check_duplicate_handles_none_metadata`` — mocks the collection
  query to return ``None`` for one metadata and document, asserts the
  call does not crash and the sentinel-rendered entry has wing/room "?"
  and empty content.
* ``test_layer1_handles_none_metadata`` / ``_handles_none_document``
* ``test_layer2_handles_none_metadata``

Relationship to other open PRs:

* **#1019** guarded ``searcher.py`` loops. This PR extends the same
  guard to the three call sites #1019 did not touch.
* **#979** fixed ``tool_check_duplicate`` negative similarity but left
  the None-metadata path unguarded.
* Does not overlap **#1013** (``Layer3.search_raw``) or **#999**.
---
 mempalace/layers.py      |  4 +++
 mempalace/mcp_server.py  |  6 ++--
 tests/test_layers.py     | 69 ++++++++++++++++++++++++++++++++++++++++
 tests/test_mcp_server.py | 36 +++++++++++++++++++++
 4 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/mempalace/layers.py b/mempalace/layers.py
index a0f9b6d..b20c656 100644
--- a/mempalace/layers.py
+++ b/mempalace/layers.py
@@ -124,6 +124,8 @@ def generate(self) -> str:
         # Score each drawer: prefer high importance, recent filing
         scored = []
         for doc, meta in zip(docs, metas):
+            meta = meta or {}
+            doc = doc or ""
             importance = 3
             # Try multiple metadata keys that might carry weight info
             for key in ("importance", "emotional_weight", "weight"):
@@ -222,6 +224,8 @@ def retrieve(self, wing: str = None, room: str = None, n_results: int = 10) -> s
 
         lines = [f"## L2 — ON-DEMAND ({len(docs)} drawers)"]
         for doc, meta in zip(docs[:n_results], metas[:n_results]):
+            meta = meta or {}
+            doc = doc or ""
             room_name = meta.get("room", "?")
             source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
             snippet = doc.strip().replace("\n", " ")
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 06355c4..ae1eb71 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -484,8 +484,10 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
                 dist = results["distances"][0][i]
                 similarity = round(1 - dist, 3)
                 if similarity >= threshold:
-                    meta = results["metadatas"][0][i]
-                    doc = results["documents"][0][i]
+                    # Chroma 1.5.x can return None for partially-flushed rows;
+                    # coerce to empty sentinels so downstream .get() is safe.
+                    meta = results["metadatas"][0][i] or {}
+                    doc = results["documents"][0][i] or ""
                     duplicates.append(
                         {
                             "id": drawer_id,
diff --git a/tests/test_layers.py b/tests/test_layers.py
index 575183f..d4c54ce 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -655,3 +655,72 @@ def test_memory_stack_status_with_palace(tmp_path):
 
     assert result["total_drawers"] == 42
     assert result["L0_identity"]["exists"] is True
+
+
+# ── Layer1 / Layer2 None-metadata guards ───────────────────────────────
+#
+# Chroma 1.5.x can return ``None`` inside the ``metadatas`` / ``documents``
+# lists for partially-flushed rows. The Layer1.generate() and
+# Layer2.retrieve() loops previously called ``meta.get(...)`` without
+# coercing, raising ``AttributeError: 'NoneType' object has no attribute
+# 'get'`` and blowing up the whole wake-up render. These tests guard that
+# the loops tolerate the None entries and render the rest of the result.
+
+
+def test_layer1_handles_none_metadata():
+    """Layer1.generate tolerates None entries in the metadatas list."""
+    docs = ["important memory", "another memory"]
+    metas = [{"room": "decisions", "source_file": "a.txt"}, None]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.generate()
+
+    assert "ESSENTIAL STORY" in result
+    assert "important memory" in result
+
+
+def test_layer1_handles_none_document():
+    """Layer1.generate tolerates None entries in the documents list."""
+    docs = ["first doc", None]
+    metas = [
+        {"room": "r", "source_file": "a.txt"},
+        {"room": "r", "source_file": "b.txt"},
+    ]
+    mock_col = _mock_chromadb_for_layer(docs, metas)
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer1(palace_path="/fake")
+        result = layer.generate()
+
+    assert result  # Render succeeded despite the None document.
+
+
+def test_layer2_handles_none_metadata():
+    """Layer2.retrieve tolerates None entries in the metadatas list."""
+    mock_col = MagicMock()
+    mock_col.get.return_value = {
+        "documents": ["first doc", "second doc"],
+        "metadatas": [{"room": "r", "source_file": "a.txt"}, None],
+    }
+
+    with (
+        patch("mempalace.layers.MempalaceConfig") as mock_cfg,
+        patch("mempalace.layers._get_collection", return_value=mock_col),
+    ):
+        mock_cfg.return_value.palace_path = "/fake"
+        layer = Layer2(palace_path="/fake")
+        # Should not raise AttributeError on the None entry.
+        result = layer.retrieve()
+
+    assert "L2 — ON-DEMAND" in result
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 899e6a7..e376f43 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -9,6 +9,7 @@
 from datetime import datetime
 import json
 import sys
+from unittest.mock import MagicMock
 
 import pytest
 
@@ -495,6 +496,41 @@ def test_delete_drawer_not_found(self, monkeypatch, config, palace_path, seeded_
         result = tool_delete_drawer("nonexistent_drawer")
         assert result["success"] is False
 
+    def test_check_duplicate_handles_none_metadata(self, monkeypatch, config, kg):
+        """tool_check_duplicate must tolerate None entries in the result lists
+        that ChromaDB 1.5.x returns for partially-flushed rows.
+
+        Previously ``meta = results["metadatas"][0][i]`` was unguarded and
+        raised ``AttributeError: 'NoneType' object has no attribute 'get'``
+        the moment the first matching drawer came back with None metadata —
+        surfacing to the MCP client as the uninformative
+        ``"Duplicate check failed"`` because the broad ``except Exception``
+        wrapper swallows the real cause.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        mock_col = MagicMock()
+        mock_col.query.return_value = {
+            "ids": [["d1", "d2"]],
+            "distances": [[0.05, 0.05]],
+            "metadatas": [[{"wing": "w", "room": "r"}, None]],
+            "documents": [["first doc", None]],
+        }
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: mock_col)
+
+        result = mcp_server.tool_check_duplicate("any content", threshold=0.5)
+
+        # Both entries land in matches (above threshold), None ones rendered
+        # with sentinel values rather than crashing the whole response.
+        assert result.get("is_duplicate") is True
+        assert len(result["matches"]) == 2
+        # The None-metadata entry falls back to sentinels.
+        none_entry = result["matches"][1]
+        assert none_entry["wing"] == "?"
+        assert none_entry["room"] == "?"
+        assert none_entry["content"] == ""
+
     def test_check_duplicate(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
         from mempalace.mcp_server import tool_check_duplicate

From 35b033d77ff378b9ff2999bff965d639e0672039 Mon Sep 17 00:00:00 2001
From: alonehobo <sergey.martemyanov@gmail.com>
Date: Tue, 21 Apr 2026 12:33:58 +0500
Subject: [PATCH 002/127] fix(mcp): force UTF-8 on stdio to fix -32000 on
 non-ASCII payloads

On Windows, Python defaults sys.stdin/sys.stdout to the system codepage
(e.g. cp1251 on Russian locales, cp1252 on Western European), while MCP
JSON-RPC is always UTF-8. Non-ASCII payloads (Cyrillic, CJK, accented
European) get mis-decoded before reaching handlers, causing json.loads
to fail or tool handlers to receive garbled strings. Both surface to
the client as a generic MCP error -32000.

Reproduction:
  1. On Windows with a non-Latin locale, call mempalace_add_drawer or
     mempalace_kg_add with Cyrillic/CJK in content or KG object.
  2. Server returns: MCP error -32000: Internal tool error.
  3. Calling the handler directly from Python works fine -- the bug is
     purely in the stdio transport.

Fix:
  Reconfigure stdin/stdout to UTF-8 at the start of main(), after
  _restore_stdout(). Uses errors="replace" defensively so a lone bad
  byte cannot take down the server. Guarded by hasattr(reconfigure)
  for exotic stream replacements.

This matches the behaviour of PYTHONUTF8=1 / python -X utf8 without
requiring users to set an env var.
---
 mempalace/mcp_server.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 6fe8225..48e8031 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1689,6 +1689,16 @@ def _restore_stdout():
 
 def main():
     _restore_stdout()
+    # Force UTF-8 on stdio. MCP JSON-RPC is UTF-8, but Python on Windows
+    # defaults stdin/stdout to the system codepage (e.g. cp1251), which
+    # corrupts non-ASCII payloads and surfaces as generic -32000 errors on
+    # Cyrillic/CJK content. See PEP 540.
+    for stream in (sys.stdin, sys.stdout):
+        if hasattr(stream, "reconfigure"):
+            try:
+                stream.reconfigure(encoding="utf-8", errors="replace")
+            except (AttributeError, OSError):
+                pass
     logger.info("MemPalace MCP Server starting...")
     while True:
         try:

From c2e053176cf4194fedde2a3361c2785e16426008 Mon Sep 17 00:00:00 2001
From: Sathvik-1007 <spacetime1007@gmail.com>
Date: Thu, 23 Apr 2026 01:40:38 +0530
Subject: [PATCH 003/127] fix: add total count to tool_list_drawers pagination
 response
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The list_drawers response only included count (current page size) with
no total field, making it impossible for callers to know when pagination
is exhausted. A page returning count == limit is ambiguous — it could
be the last exact-fit page or there could be more results.

Add a total field that reports the full number of matching drawers.
For unfiltered requests this uses col.count(); for filtered requests
(wing/room) it uses a lightweight col.get(include=[]) to count
matching IDs without fetching documents.
---
 mempalace/mcp_server.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 6fe8225..e5057af 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -746,6 +746,13 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
             kwargs["where"] = where
         result = col.get(**kwargs)
 
+        # Compute total matching drawers for pagination.
+        if where:
+            total_result = col.get(where=where, include=[])
+            total = len(total_result["ids"])
+        else:
+            total = col.count()
+
         drawers = []
         for i, did in enumerate(result["ids"]):
             meta = result["metadatas"][i]
@@ -760,6 +767,7 @@ def tool_list_drawers(wing: str = None, room: str = None, limit: int = 20, offse
             )
         return {
             "drawers": drawers,
+            "total": total,
             "count": len(drawers),
             "offset": offset,
             "limit": limit,
@@ -1436,7 +1444,7 @@ def tool_reconnect():
         "handler": tool_get_drawer,
     },
     "mempalace_list_drawers": {
-        "description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, and content previews.",
+        "description": "List drawers with pagination. Optional wing/room filter. Returns IDs, wings, rooms, content previews, and total matching count for pagination.",
         "input_schema": {
             "type": "object",
             "properties": {

From 0b8c2c158f1e9edbd14aac1d9b85abfd2bed97e4 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Sun, 26 Apr 2026 13:00:27 +0200
Subject: [PATCH 004/127] fix(kg): reject inverted intervals in add_triple
 (valid_to < valid_from)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A triple with valid_to < valid_from satisfies neither of the temporal
filter clauses in query_entity():

    valid_from <= as_of AND valid_to >= as_of

so the triple is invisible to every query — silently corrupt. Reject
at write time with a clear error instead of letting bad data pile up
in the SQLite store.

The guard only fires when both bounds are present; open intervals
(only valid_from or only valid_to) are still accepted, and same-day
intervals (valid_from == valid_to, point-in-time facts) are explicitly
allowed.
---
 mempalace/knowledge_graph.py  |  9 +++++++++
 tests/test_knowledge_graph.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py
index 9096ab2..30055a1 100644
--- a/mempalace/knowledge_graph.py
+++ b/mempalace/knowledge_graph.py
@@ -171,6 +171,15 @@ def add_triple(
             add_triple("Max", "does", "swimming", valid_from="2025-01-01")
             add_triple("Alice", "worried_about", "Max injury", valid_from="2026-01", valid_to="2026-02")
         """
+        # Reject inverted intervals: a triple with valid_to < valid_from
+        # would never satisfy `valid_from <= as_of AND valid_to >= as_of`,
+        # so it would be invisible to every query — silently corrupt.
+        if valid_from is not None and valid_to is not None and valid_to < valid_from:
+            raise ValueError(
+                f"valid_to={valid_to!r} is before valid_from={valid_from!r}; "
+                "an inverted interval would be invisible to every KG query"
+            )
+
         sub_id = self._entity_id(subject)
         obj_id = self._entity_id(obj)
         pred = predicate.lower().replace(" ", "_")
diff --git a/tests/test_knowledge_graph.py b/tests/test_knowledge_graph.py
index d7d9838..6eeb8d3 100644
--- a/tests/test_knowledge_graph.py
+++ b/tests/test_knowledge_graph.py
@@ -5,6 +5,8 @@
 timeline, stats, and edge cases (duplicate triples, ID collisions).
 """
 
+import pytest
+
 
 class TestEntityOperations:
     def test_add_entity(self, kg):
@@ -45,6 +47,38 @@ def test_invalidated_triple_allows_re_add(self, kg):
         tid2 = kg.add_triple("Alice", "works_at", "Acme")
         assert tid1 != tid2  # new triple since old one was closed
 
+    def test_add_triple_rejects_inverted_interval(self, kg):
+        # valid_to before valid_from would never satisfy
+        # `valid_from <= as_of AND valid_to >= as_of` — silently invisible
+        # to every query. Reject at write time instead.
+        with pytest.raises(ValueError, match="before valid_from"):
+            kg.add_triple(
+                "Alice",
+                "worked_at",
+                "Acme",
+                valid_from="2026-03-01",
+                valid_to="2026-02-01",
+            )
+
+    def test_add_triple_accepts_equal_dates(self, kg):
+        # Same-day intervals are valid (point-in-time facts).
+        tid = kg.add_triple(
+            "Alice",
+            "joined",
+            "Acme",
+            valid_from="2026-03-15",
+            valid_to="2026-03-15",
+        )
+        assert tid.startswith("t_alice_joined_acme_")
+
+    def test_add_triple_allows_only_one_bound(self, kg):
+        # The guard only fires when BOTH bounds are set.
+        tid1 = kg.add_triple("Alice", "knows", "Bob", valid_from="2026-01-01")
+        assert tid1.startswith("t_alice_knows_bob_")
+        kg.invalidate("Alice", "knows", "Bob", ended="2026-02-01")
+        tid2 = kg.add_triple("Alice", "knew", "Bob", valid_to="2026-03-01")
+        assert tid2.startswith("t_alice_knew_bob_")
+
 
 class TestQueries:
     def test_query_outgoing(self, seeded_kg):

From f30fdf2672f340c773d4f0554b0017abb48ed4b9 Mon Sep 17 00:00:00 2001
From: imtylervo <toanntq@gmail.com>
Date: Mon, 27 Apr 2026 14:16:20 +1000
Subject: [PATCH 005/127] fix: serialize ChromaCollection writes through palace
 lock

#976 protects `mempalace mine`, but MCP/direct backend writers still call
ChromaCollection.add/upsert/update/delete without the palace lock. This
moves the lock boundary to the Chroma backend seam so all Chroma writes
share the same palace-level serialization, with a re-entrant guard for
miner paths that already hold the lock.

mine_palace_lock(palace_path) gains a per-thread re-entrant guard
(threading.local + pid-tag against fork inheritance) so
ChromaCollection write methods can take the lock without
self-deadlocking when called from inside miner.mine()'s outer hold.

ChromaCollection.__init__ accepts an optional palace_path; when set,
add/upsert/update/delete wrap their underlying chromadb call with
mine_palace_lock(palace_path). palace_path=None preserves the legacy
no-lock behaviour for direct callers and tests. ChromaBackend's
get_collection/create_collection pass palace_path through;
mcp_server._get_collection forwards _config.palace_path so all MCP
write tools inherit the wrapping.

Tests: 5 new in tests/test_chroma_collection_lock.py covering opt-in,
writer-blocks-during-mine, re-entrant-inside-mine, two-process
serialization, and a source-level read-path-not-locked pin. Plus 1 new
+ 1 rewritten in tests/test_palace_locks.py for the re-entrant
semantics. 52 passed in 1.01s including the existing test_backends.py
regression suite.

Refs #1161.
---
 mempalace/backends/chroma.py         |  54 ++++-
 mempalace/mcp_server.py              |   4 +-
 mempalace/palace.py                  |  59 ++++-
 tests/test_chroma_collection_lock.py | 327 +++++++++++++++++++++++++++
 tests/test_palace_locks.py           |  70 +++++-
 5 files changed, 497 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_chroma_collection_lock.py

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index ad7748f..d438236 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -1,5 +1,6 @@
 """ChromaDB-backed MemPalace storage backend (RFC 001 reference implementation)."""
 
+import contextlib
 import datetime as _dt
 import logging
 import os
@@ -573,10 +574,43 @@ def _as_list(v: Any) -> list:
 
 
 class ChromaCollection(BaseCollection):
-    """Thin adapter translating ChromaDB dict returns into typed results."""
+    """Thin adapter translating ChromaDB dict returns into typed results.
+
+    When ``palace_path`` is set, all write methods (``add``, ``upsert``,
+    ``update``, ``delete``) acquire ``mine_palace_lock(palace_path)`` for the
+    duration of the underlying chromadb call. This serializes MCP and other
+    direct-backend writers against ``mempalace mine`` and against each other,
+    closing the race between concurrent writers that triggers ChromaDB's
+    multi-threaded HNSW corruption (#974/#965).
+
+    The lock is the same primitive used by ``miner.mine()`` so re-entrant
+    acquisition from inside the mine pipeline (mine -> _mine_body ->
+    collection.upsert) is short-circuited by the per-thread guard inside
+    ``mine_palace_lock`` — no self-deadlock.
+
+    ``palace_path=None`` disables the wrapping, preserving the legacy
+    no-lock behaviour for callers that construct a ``ChromaCollection``
+    directly without going through ``ChromaBackend``.
+    """
 
-    def __init__(self, collection):
+    def __init__(self, collection, palace_path: Optional[str] = None):
         self._collection = collection
+        self._palace_path = palace_path
+
+    @contextlib.contextmanager
+    def _write_lock(self):
+        """Acquire ``mine_palace_lock`` for the configured palace, if any.
+
+        No-op (yields immediately) when ``self._palace_path`` is None.
+        """
+        if self._palace_path is None:
+            yield
+            return
+        # Late import — palace.py imports ChromaBackend from this module.
+        from ..palace import mine_palace_lock
+
+        with mine_palace_lock(self._palace_path):
+            yield
 
     # ------------------------------------------------------------------
     # Writes
@@ -588,7 +622,8 @@ def add(self, *, documents, ids, metadatas=None, embeddings=None):
             kwargs["metadatas"] = metadatas
         if embeddings is not None:
             kwargs["embeddings"] = embeddings
-        self._collection.add(**kwargs)
+        with self._write_lock():
+            self._collection.add(**kwargs)
 
     def upsert(self, *, documents, ids, metadatas=None, embeddings=None):
         kwargs: dict[str, Any] = {"documents": documents, "ids": ids}
@@ -596,7 +631,8 @@ def upsert(self, *, documents, ids, metadatas=None, embeddings=None):
             kwargs["metadatas"] = metadatas
         if embeddings is not None:
             kwargs["embeddings"] = embeddings
-        self._collection.upsert(**kwargs)
+        with self._write_lock():
+            self._collection.upsert(**kwargs)
 
     def update(
         self,
@@ -615,7 +651,8 @@ def update(
             kwargs["metadatas"] = metadatas
         if embeddings is not None:
             kwargs["embeddings"] = embeddings
-        self._collection.update(**kwargs)
+        with self._write_lock():
+            self._collection.update(**kwargs)
 
     # ------------------------------------------------------------------
     # Reads
@@ -759,7 +796,8 @@ def delete(self, *, ids=None, where=None):
             kwargs["ids"] = ids
         if where is not None:
             kwargs["where"] = where
-        self._collection.delete(**kwargs)
+        with self._write_lock():
+            self._collection.delete(**kwargs)
 
     def count(self):
         return self._collection.count()
@@ -998,7 +1036,7 @@ def get_collection(
         else:
             collection = client.get_collection(collection_name, **ef_kwargs)
         _pin_hnsw_threads(collection)
-        return ChromaCollection(collection)
+        return ChromaCollection(collection, palace_path=palace_path)
 
     def close_palace(self, palace) -> None:
         """Drop cached handles for ``palace``. Accepts ``PalaceRef`` or legacy path str."""
@@ -1045,7 +1083,7 @@ def create_collection(
             metadata={"hnsw:space": hnsw_space, "hnsw:num_threads": 1},
             **ef_kwargs,
         )
-        return ChromaCollection(collection)
+        return ChromaCollection(collection, palace_path=palace_path)
 
 
 def _normalize_get_collection_args(args, kwargs):
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 9cc454e..7e22227 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -288,13 +288,13 @@ def _get_collection(create=False):
                 metadata={"hnsw:space": "cosine", "hnsw:num_threads": 1},
             )
             _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw)
+            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
             _metadata_cache = None
             _metadata_cache_time = 0
         elif _collection_cache is None:
             raw = client.get_collection(_config.collection_name)
             _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw)
+            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
             _metadata_cache = None
             _metadata_cache_time = 0
         return _collection_cache
diff --git a/mempalace/palace.py b/mempalace/palace.py
index 07efb6a..97f67ff 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -8,6 +8,7 @@
 import hashlib
 import os
 import re
+import threading
 
 from .backends.chroma import ChromaBackend
 
@@ -314,6 +315,47 @@ class MineAlreadyRunning(RuntimeError):
     """Raised when another `mempalace mine` already holds the per-palace lock."""
 
 
+# Per-thread record of palaces this thread already holds the lock for. Used by
+# `mine_palace_lock` to short-circuit re-entrant acquisition from the same
+# thread (e.g. miner.mine() acquires the outer lock then calls
+# ChromaCollection.upsert which now also tries to acquire). Without this guard
+# the inner call would block on its own outer flock (Linux fcntl locks are per
+# open file description, so a same-thread second open of the lock file is a
+# distinct lock and self-deadlocks).
+#
+# The holder set is tagged with ``pid`` so that a forked child does NOT
+# inherit re-entrant credit from its parent: the OS-level flock IS NOT
+# inherited as a "we hold it" semantically — the child must reacquire — but
+# Python's ``threading.local`` IS inherited across fork. The pid check
+# clears stale state so a forked child correctly hits the fcntl path.
+_palace_lock_holders = threading.local()
+
+
+def _holder_state():
+    """Return the per-thread (pid, keys) record, refreshing after fork."""
+    keys = getattr(_palace_lock_holders, "keys", None)
+    pid = getattr(_palace_lock_holders, "pid", None)
+    current_pid = os.getpid()
+    if keys is None or pid != current_pid:
+        keys = set()
+        _palace_lock_holders.keys = keys
+        _palace_lock_holders.pid = current_pid
+    return keys
+
+
+def _held_by_this_thread(lock_key: str) -> bool:
+    """Return True if this thread already holds ``mine_palace_lock`` for ``lock_key``."""
+    return lock_key in _holder_state()
+
+
+def _mark_held(lock_key: str) -> None:
+    _holder_state().add(lock_key)
+
+
+def _mark_released(lock_key: str) -> None:
+    _holder_state().discard(lock_key)
+
+
 @contextlib.contextmanager
 def mine_palace_lock(palace_path: str):
     """Per-palace non-blocking lock around the full `mine` pipeline.
@@ -338,6 +380,12 @@ def mine_palace_lock(palace_path: str):
     Non-blocking: if another `mine` is already writing to this palace,
     raise MineAlreadyRunning so the caller can exit cleanly instead of
     piling up as a waiting worker.
+
+    Re-entrant: if the current thread already holds the lock for the same
+    palace, the context manager passes through without re-acquiring. This
+    lets ChromaCollection write methods (which acquire the lock themselves
+    to protect MCP/direct callers) compose with miner.mine() (which holds
+    the outer lock for the entire mine pipeline) without self-deadlock.
     """
     lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
     os.makedirs(lock_dir, exist_ok=True)
@@ -346,6 +394,11 @@ def mine_palace_lock(palace_path: str):
     palace_key = hashlib.sha256(lock_key_source.encode()).hexdigest()[:16]
     lock_path = os.path.join(lock_dir, f"mine_palace_{palace_key}.lock")
 
+    if _held_by_this_thread(palace_key):
+        # Same thread already holds the lock for this palace — pass through.
+        yield
+        return
+
     lf = open(lock_path, "w")
     acquired = False
     try:
@@ -369,7 +422,11 @@ def mine_palace_lock(palace_path: str):
                 raise MineAlreadyRunning(
                     f"another `mempalace mine` is already running against {resolved}"
                 ) from exc
-        yield
+        _mark_held(palace_key)
+        try:
+            yield
+        finally:
+            _mark_released(palace_key)
     finally:
         if acquired:
             try:
diff --git a/tests/test_chroma_collection_lock.py b/tests/test_chroma_collection_lock.py
new file mode 100644
index 0000000..b5d30fb
--- /dev/null
+++ b/tests/test_chroma_collection_lock.py
@@ -0,0 +1,327 @@
+"""Tests for ChromaCollection's palace-write-lock integration.
+
+Closes the gap left by ``mine_palace_lock`` only protecting the
+``mempalace mine`` pipeline: MCP/direct writers that call
+``ChromaCollection.add/upsert/update/delete`` must also serialize against
+mine and against each other to avoid the multi-threaded HNSW corruption
+documented in #974/#965.
+
+Property tested:
+
+* ``ChromaCollection(c, palace_path=p)`` wraps every write with
+  ``mine_palace_lock(p)``.
+* Writes raise ``MineAlreadyRunning`` when another holder owns the lock
+  (instead of silently racing into the underlying chromadb call).
+* Re-entrant composition with ``miner.mine()`` does not self-deadlock:
+  ``with mine_palace_lock(p): col.upsert(...)`` runs to completion.
+* ``ChromaCollection(c)`` (no palace_path) preserves legacy no-lock
+  behaviour for tests/callers that build the adapter directly without
+  going through ``ChromaBackend``.
+
+POSIX-only: ``mine_palace_lock`` uses ``fcntl`` on Unix and ``msvcrt`` on
+Windows; the contention semantics differ enough that the cross-process
+tests are skipped on Windows runners.
+"""
+
+from __future__ import annotations
+
+import multiprocessing
+import os
+import time
+
+import pytest
+
+from mempalace.backends.chroma import ChromaCollection
+from mempalace.palace import MineAlreadyRunning, mine_palace_lock
+
+
+def _get_mp_context():
+    """Same start-method picker as test_palace_locks.py."""
+    start_method = "spawn" if os.name == "nt" else "fork"
+    return multiprocessing.get_context(start_method)
+
+
+# ---------------------------------------------------------------------------
+# Fakes
+# ---------------------------------------------------------------------------
+
+
+class _FakeChromaCollection:
+    """Records calls; never blocks. Stand-in for chromadb.Collection."""
+
+    def __init__(self):
+        self.adds: list[dict] = []
+        self.upserts: list[dict] = []
+        self.updates: list[dict] = []
+        self.deletes: list[dict] = []
+
+    def add(self, **kwargs):
+        self.adds.append(kwargs)
+
+    def upsert(self, **kwargs):
+        self.upserts.append(kwargs)
+
+    def update(self, **kwargs):
+        self.updates.append(kwargs)
+
+    def delete(self, **kwargs):
+        self.deletes.append(kwargs)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _hold_lock(palace_path: str, ready_flag: str, release_flag: str) -> int:
+    """Acquire ``mine_palace_lock``, signal readiness, wait for release.
+
+    Mirrors the helper in ``test_palace_locks.py`` so the contention
+    semantics match across both test files.
+    """
+    try:
+        with mine_palace_lock(palace_path):
+            open(ready_flag, "w").close()
+            for _ in range(500):
+                if os.path.exists(release_flag):
+                    return 0
+                time.sleep(0.01)
+            return 0
+    except MineAlreadyRunning:
+        return 1
+
+
+# ---------------------------------------------------------------------------
+# Tests — opt-in lock wiring
+# ---------------------------------------------------------------------------
+
+
+def test_palace_path_none_skips_lock(tmp_path, monkeypatch):
+    """Legacy callers (``ChromaCollection(c)``) keep no-lock behaviour.
+
+    A ``ChromaCollection`` built without ``palace_path`` must not touch the
+    lock infrastructure at all. This guards against regressions where a
+    test or third-party caller relies on the historical bare-write path.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake)  # no palace_path -> no lock
+
+    # Hold the lock in a child process. Without palace_path, the parent
+    # write must still succeed (the lock does not gate this caller).
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        col.upsert(documents=["doc"], ids=["id-1"])
+        assert fake.upserts == [{"documents": ["doc"], "ids": ["id-1"]}]
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_writer_blocks_during_mine(tmp_path, monkeypatch):
+    """A held ``mine_palace_lock`` causes ``ChromaCollection`` writes to raise.
+
+    This is the property that closes the MCP-bypass gap: when a mine is in
+    flight, MCP/direct writes raise ``MineAlreadyRunning`` rather than
+    silently entering chromadb's write path concurrent with mine.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        fake = _FakeChromaCollection()
+        col = ChromaCollection(fake, palace_path=palace)
+
+        with pytest.raises(MineAlreadyRunning):
+            col.upsert(documents=["doc"], ids=["id-1"])
+        with pytest.raises(MineAlreadyRunning):
+            col.add(documents=["doc"], ids=["id-2"])
+        with pytest.raises(MineAlreadyRunning):
+            col.update(ids=["id-3"], documents=["doc"])
+        with pytest.raises(MineAlreadyRunning):
+            col.delete(ids=["id-4"])
+
+        # The fake must have received NO calls — the lock must gate
+        # before reaching the underlying chromadb layer.
+        assert fake.upserts == []
+        assert fake.adds == []
+        assert fake.updates == []
+        assert fake.deletes == []
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_inside_mine_passes_through(tmp_path, monkeypatch):
+    """``ChromaCollection.upsert`` inside ``mine_palace_lock`` does not deadlock.
+
+    ``miner.mine()`` already holds ``mine_palace_lock(palace_path)`` for the
+    full mine pipeline; ``_mine_body`` then calls
+    ``collection.upsert(...)``. With the per-thread re-entrant guard in
+    ``mine_palace_lock``, the inner acquire is a pass-through and the
+    underlying chromadb call runs immediately.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    fake = _FakeChromaCollection()
+    col = ChromaCollection(fake, palace_path=palace)
+
+    with mine_palace_lock(palace):
+        # If the re-entrant guard were missing, this would self-deadlock on
+        # the underlying flock. We rely on pytest-timeout (configured in
+        # pyproject.toml) to enforce this in CI; the assertion just confirms
+        # the call landed.
+        col.upsert(documents=["d"], ids=["i"], metadatas=[{"k": "v"}])
+        col.add(documents=["d2"], ids=["i2"])
+        col.update(ids=["i"], documents=["d-updated"])
+        col.delete(ids=["i2"])
+
+    assert len(fake.upserts) == 1
+    assert len(fake.adds) == 1
+    assert len(fake.updates) == 1
+    assert len(fake.deletes) == 1
+
+
+class _SlowFakeChromaCollection(_FakeChromaCollection):
+    """Fake whose write methods hold the caller for ``hold_seconds``.
+
+    Used to keep ``mine_palace_lock`` acquired long enough for a sibling
+    process to contend deterministically.
+    """
+
+    def __init__(self, hold_seconds: float = 0.3):
+        super().__init__()
+        self._hold = hold_seconds
+
+    def upsert(self, **kwargs):
+        time.sleep(self._hold)
+        super().upsert(**kwargs)
+
+
+def _slow_writer_target(palace_path, tmp_path_str, pid, result_q):
+    """Subprocess target: try a slow upsert, report ok/busy."""
+    os.environ["HOME"] = tmp_path_str
+    # Fresh import inside child so HOME monkeypatch routes the lock dir.
+    from mempalace.backends.chroma import ChromaCollection as _CC
+    from mempalace.palace import MineAlreadyRunning as _MAR
+
+    fake = _SlowFakeChromaCollection(hold_seconds=0.3)
+    col = _CC(fake, palace_path=palace_path)
+    try:
+        col.upsert(documents=[f"d{pid}"], ids=[f"i{pid}"])
+        result_q.put(("ok", pid))
+    except _MAR:
+        result_q.put(("busy", pid))
+
+
+def test_concurrent_writers_serialize(tmp_path, monkeypatch):
+    """Two processes calling ``ChromaCollection.upsert`` against the same
+    palace must be serialized: at most one enters chromadb at a time, the
+    other raises ``MineAlreadyRunning``.
+
+    This is the property that prevents the parallel HNSW insert race that
+    drives #974/#965 — under concurrent MCP write fan-out, exactly one
+    writer reaches chromadb and the rest fail loudly instead of corrupting
+    the index.
+
+    The slow fake holds the lock for 0.3s per writer, large enough for the
+    second process to contend even on slow CI runners.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+
+    ctx = _get_mp_context()
+    result_q = ctx.Queue()
+
+    p1 = ctx.Process(
+        target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q)
+    )
+    p2 = ctx.Process(
+        target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q)
+    )
+    p1.start()
+    # Tiny stagger so p1 wins the race deterministically; without it the
+    # OS scheduler can pick either, which is also a valid outcome but
+    # makes the assertion brittle on slow CI.
+    time.sleep(0.05)
+    p2.start()
+    p1.join(timeout=5)
+    p2.join(timeout=5)
+
+    outcomes = [result_q.get(timeout=1) for _ in range(2)]
+    statuses = sorted(o[0] for o in outcomes)
+    assert statuses == ["busy", "ok"], (
+        f"expected one ok + one busy, got {outcomes}"
+    )
+
+
+def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
+    """``query`` / ``get`` / ``count`` must not be gated by the write lock.
+
+    Read traffic is the dominant workload (semantic search, MCP get, etc.)
+    and serializing it against mine would tank latency for no correctness
+    benefit. This test pins that property: with another process holding
+    the write lock, reads must still complete instantly.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(palace, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock"
+
+        # _FakeChromaCollection doesn't implement query/get/count; we only
+        # need to confirm the wrapper does not call into mine_palace_lock
+        # for reads, which we assert by observing the wrapped methods are
+        # NOT in ChromaCollection's _write_lock path. A direct check via
+        # source inspection is more honest than mocking the entire chroma
+        # surface here.
+        import inspect
+
+        from mempalace.backends.chroma import ChromaCollection as _CC
+
+        for write_attr in ("add", "upsert", "update", "delete"):
+            src = inspect.getsource(getattr(_CC, write_attr))
+            assert "_write_lock" in src, f"{write_attr} should acquire write lock"
+
+        for read_attr in ("query", "get", "count"):
+            method = getattr(_CC, read_attr, None)
+            if method is None:
+                continue
+            src = inspect.getsource(method)
+            assert "_write_lock" not in src, (
+                f"{read_attr} must NOT acquire the write lock (read path)"
+            )
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 601c894..39aa50c 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -135,19 +135,77 @@ def test_different_palaces_dont_conflict(tmp_path, monkeypatch):
 
 
 def test_palace_path_is_normalized(tmp_path, monkeypatch):
-    """Relative and absolute forms of the same path must use the same lock."""
+    """Relative and absolute forms of the same path must use the same lock.
+
+    Cross-process variant: a child holds the absolute form, a relative form
+    in the parent must hash to the same lock key and raise
+    ``MineAlreadyRunning``. (The same-thread case is now a re-entrant
+    pass-through by design — see ``test_reentrant_same_thread_passes_through``
+    — so we exercise the normalization invariant across a process boundary
+    where re-entrance does not apply.)
+    """
     monkeypatch.setenv("HOME", str(tmp_path))
     monkeypatch.chdir(tmp_path)
     os.makedirs(tmp_path / "palace", exist_ok=True)
     absolute = str(tmp_path / "palace")
-    relative = "palace"
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
 
-    # Hold the lock with the absolute form; attempting to re-acquire with
-    # the relative form (which resolves to the same absolute path) must fail.
-    with mine_palace_lock(absolute):
+    ctx = _get_mp_context()
+    holder = ctx.Process(target=_hold_lock, args=(absolute, ready, release))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock in time"
+
+        # Parent holds CWD = tmp_path so "palace" is the same on-disk dir as
+        # the absolute form. The lock key is sha256(realpath+normcase) so the
+        # two forms must collide.
         with pytest.raises(MineAlreadyRunning):
-            with mine_palace_lock(relative):
+            with mine_palace_lock("palace"):
                 pytest.fail("normalized path collision should have raised")
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
+    """Same thread re-acquiring the same palace lock must not deadlock or raise.
+
+    This is the invariant that makes ``ChromaCollection`` write methods (which
+    take ``mine_palace_lock`` for MCP/direct-writer protection) compose with
+    ``miner.mine()`` (which already holds the lock for the entire mine
+    pipeline). Without the per-thread re-entrant guard the inner acquire
+    would self-deadlock on the outer flock.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    with mine_palace_lock(palace):
+        # Re-enter from the same thread — must yield without raising or hanging.
+        with mine_palace_lock(palace):
+            pass
+        # After the inner exits, the outer is still held: confirm via a
+        # subprocess that tries to acquire and reports back.
+        ctx = _get_mp_context()
+        result_q = ctx.Queue()
+        child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
+        child.start()
+        child.join(timeout=5)
+        assert result_q.get(timeout=1) == "busy", (
+            "outer lock should still be held by parent after inner re-entrant exit"
+        )
+
+
+def _try_acquire_expect_busy(palace_path, result_q):
+    """Helper: try to acquire, push 'busy' (raised) or 'free' (acquired) into queue."""
+    try:
+        with mine_palace_lock(palace_path):
+            result_q.put("free")
+    except MineAlreadyRunning:
+        result_q.put("busy")
 
 
 def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):

From db28bf1e846592f997835ca42050e43c12b09303 Mon Sep 17 00:00:00 2001
From: sha2fiddy <103975074+sha2fiddy@users.noreply.github.com>
Date: Wed, 29 Apr 2026 19:01:54 -0400
Subject: [PATCH 006/127] fix: paginate closet_llm col.get (#1073)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror the pagination pattern PR #851 landed in miner.py:status().
A single drawers_col.get(limit=total, ...) on palaces larger than
SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) crashes inside chromadb.

Fetch drawers in batch_size=5000 chunks, stepping offset until the
collection is drained. by_source aggregation semantics are preserved
exactly — grouping, wing filter, meta capture all unchanged.

Closes #1073. Related: #802, #850, #1016.
---
 mempalace/closet_llm.py  |  33 +++++---
 tests/test_closet_llm.py | 176 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+), 11 deletions(-)

diff --git a/mempalace/closet_llm.py b/mempalace/closet_llm.py
index c00b735..6274f79 100644
--- a/mempalace/closet_llm.py
+++ b/mempalace/closet_llm.py
@@ -221,17 +221,28 @@ def regenerate_closets(
         print("No drawers in palace.")
         return {"processed": 0}
 
-    all_data = drawers_col.get(limit=total, include=["documents", "metadatas"])
-    by_source = {}
-    for doc_id, doc, meta in zip(all_data["ids"], all_data["documents"], all_data["metadatas"]):
-        source = meta.get("source_file", "unknown")
-        w = meta.get("wing", "")
-        if wing and w != wing:
-            continue
-        if source not in by_source:
-            by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
-        by_source[source]["drawer_ids"].append(doc_id)
-        by_source[source]["content"].append(doc)
+    # Paginate the fetch — a single get(limit=total, ...) blows through
+    # SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) on large palaces and
+    # crashes inside chromadb (see #802, #850, #1073).
+    by_source: dict = {}
+    batch_size = 5000
+    offset = 0
+    while offset < total:
+        batch = drawers_col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
+        ids = batch["ids"]
+        if not ids:
+            break
+        for doc_id, doc, meta in zip(ids, batch["documents"], batch["metadatas"]):
+            meta = meta or {}
+            source = meta.get("source_file", "unknown")
+            w = meta.get("wing", "")
+            if wing and w != wing:
+                continue
+            if source not in by_source:
+                by_source[source] = {"drawer_ids": [], "content": [], "meta": meta}
+            by_source[source]["drawer_ids"].append(doc_id)
+            by_source[source]["content"].append(doc)
+        offset += len(ids)
 
     sources = list(by_source.keys())
     if sample > 0:
diff --git a/tests/test_closet_llm.py b/tests/test_closet_llm.py
index a92e2fa..3a0e84e 100644
--- a/tests/test_closet_llm.py
+++ b/tests/test_closet_llm.py
@@ -296,6 +296,182 @@ def fake_urlopen(req, timeout=None):
             assert meta.get("generated_by", "").startswith("llm:")
             assert meta.get("normalize_version") == NORMALIZE_VERSION
 
+    def test_regen_paginates_drawer_fetch(self, tmp_path):
+        """Regression for #1073: drawers_col.get must be paginated at
+        batch_size=5000. A single get(limit=total, ...) on a palace with
+        more than SQLite's SQLITE_MAX_VARIABLE_NUMBER (32766) drawers
+        blows up inside chromadb. Matches the miner.status pattern
+        introduced in #851 (see #802, #850, #1073)."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # Build a fake collection: 12_000 drawers across 3 source files,
+        # enough to force 3 batches of batch_size=5000 (5000 + 5000 + 2000).
+        n_drawers = 12_000
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"doc body {i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 3}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        get_calls: list = []
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                get_calls.append({"limit": limit, "offset": offset, "include": include})
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            """Accept the purge + upsert calls the success path makes."""
+
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        fake_drawers = FakeDrawersCol()
+        fake_closets = FakeClosetsCol()
+
+        def fake_urlopen(req, timeout=None):
+            return _FakeResp(
+                {
+                    "choices": [
+                        {"message": {"content": '{"topics":["t1"],"quotes":[],"summary":""}'}}
+                    ],
+                    "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+                }
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=fake_drawers),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=fake_closets),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch("urllib.request.urlopen", side_effect=fake_urlopen),
+        ):
+            result = regenerate_closets(palace, cfg=cfg, dry_run=True)
+
+        # Three paginated calls: (limit=5000, offset=0), (5000, 5000), (5000, 10000).
+        assert len(get_calls) == 3, f"expected 3 batched fetches, got {len(get_calls)}"
+        for call in get_calls:
+            assert (
+                call["limit"] == 5000
+            ), f"batch must be 5000 — got {call['limit']} (would risk SQLITE_MAX_VARIABLE_NUMBER)"
+            # include must still request both documents and metadatas
+            assert "documents" in call["include"]
+            assert "metadatas" in call["include"]
+        assert [c["offset"] for c in get_calls] == [0, 5000, 10_000]
+
+        # by_source aggregation must be preserved exactly across batches:
+        # 12_000 drawers, 3 source files → 4_000 drawers each.
+        # dry_run=True short-circuits LLM calls but still walks by_source.
+        assert result.get("processed", 0) == 0  # dry_run
+        # Verify no single call tried to pull more than batch_size.
+        assert max(c["limit"] for c in get_calls) <= 5000
+
+    def test_regen_by_source_aggregates_across_batches(self, tmp_path):
+        """Pagination must not change the by_source grouping — drawers for
+        the same source_file split across batches still land in one group."""
+        from mempalace import closet_llm as closet_llm_mod
+
+        palace = str(tmp_path / "palace")
+
+        # 7_500 drawers, alternating between two source files → forces
+        # splits across the 5000/2500 boundary. Each source ends up with
+        # 3_750 drawers after regrouping.
+        n_drawers = 7_500
+        ids = [f"d{i:05d}" for i in range(n_drawers)]
+        docs = [f"body-{i}" for i in range(n_drawers)]
+        metas = [
+            {
+                "wing": "w",
+                "room": "r",
+                "source_file": f"/src/file_{i % 2}.md",
+                "entities": "",
+            }
+            for i in range(n_drawers)
+        ]
+
+        captured_sources: dict = {}
+
+        class FakeDrawersCol:
+            def count(self):
+                return n_drawers
+
+            def get(self, limit=None, offset=0, include=None, **kwargs):
+                end = min(offset + (limit or n_drawers), n_drawers)
+                return {
+                    "ids": ids[offset:end],
+                    "documents": docs[offset:end],
+                    "metadatas": metas[offset:end],
+                }
+
+        class FakeClosetsCol:
+            def get(self, *a, **kw):
+                return {"ids": [], "documents": [], "metadatas": []}
+
+            def delete(self, *a, **kw):
+                return None
+
+            def upsert(self, *a, **kw):
+                return None
+
+        # Hook _call_llm to inspect what regenerate_closets aggregated
+        # per source before the HTTP boundary.
+        real_call_llm = closet_llm_mod._call_llm
+
+        def spying_call_llm(cfg, source_file, wing, room, content):
+            captured_sources[source_file] = content
+            return (
+                {"topics": ["t"], "quotes": [], "summary": ""},
+                {"prompt_tokens": 1, "completion_tokens": 1},
+            )
+
+        cfg = LLMConfig(endpoint="http://local/v1", model="m")
+
+        with (
+            patch.object(closet_llm_mod, "get_collection", return_value=FakeDrawersCol()),
+            patch.object(closet_llm_mod, "get_closets_collection", return_value=FakeClosetsCol()),
+            patch.object(closet_llm_mod, "purge_file_closets", return_value=None),
+            patch.object(closet_llm_mod, "upsert_closet_lines", return_value=None),
+            patch.object(closet_llm_mod, "_call_llm", side_effect=spying_call_llm),
+        ):
+            regenerate_closets(palace, cfg=cfg)
+
+        # Both sources survived the pagination boundary.
+        assert set(captured_sources.keys()) == {"/src/file_0.md", "/src/file_1.md"}
+        # Each source accumulated exactly 3_750 drawer bodies, concatenated
+        # with the "\n\n" separator the regenerate path uses.
+        for source, content in captured_sources.items():
+            assert content.count("\n\n") == 3_749, (
+                f"{source}: expected 3_750 chunks joined (3_749 separators), "
+                f"got {content.count(chr(10) + chr(10)) + 1}"
+            )
+
+        # Silence unused-var lint.
+        assert real_call_llm is not None
+
     def test_regen_uses_basename_not_split_slash(self, tmp_path, monkeypatch):
         """Regression: the old closet_id base used ``source.split('/')[-1]``
         which silently degrades on Windows paths (``C:\\proj\\a.md`` →

From 4d98b0524084f7682f2b7df04c77be2e4312c081 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Fri, 24 Apr 2026 11:09:16 +0200
Subject: [PATCH 007/127] fix(kg): validate ISO-8601 date formats at MCP
 boundary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tool_kg_query (as_of), tool_kg_add (valid_from), and tool_kg_invalidate
(ended) accepted any string and forwarded it to SQLite without format
validation. Parameterized queries prevent SQL injection, but invalid
date strings silently produce empty result sets — callers cannot
distinguish "no fact at this time" from "your date format was
unrecognized." This is especially painful for natural-language LLM
callers that synthesize dates like "March 2026" or "Jan 2025".

Add sanitize_iso_date() in config.py alongside the other input
validators. It accepts YYYY, YYYY-MM, and YYYY-MM-DD forms; passes
through None/empty; and raises ValueError with a field-named message
on anything else. Call it from the three kg MCP tool wrappers before
values reach the storage layer so the caller gets a clear error
instead of a silent miss.

Closes #1164
---
 mempalace/config.py      | 28 ++++++++++++++++
 mempalace/mcp_server.py  |  4 +++
 tests/test_config.py     | 70 +++++++++++++++++++++++++++++++++++++++-
 tests/test_mcp_server.py | 46 ++++++++++++++++++++++++++
 4 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index cacd1f9..4005779 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -81,6 +81,34 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
     return value
 
 
+# ISO-8601 date validator for knowledge-graph temporal parameters
+# (as_of, valid_from, valid_to, ended). Parameterized queries already
+# prevent SQL injection, but unvalidated date strings silently miss
+# every row — callers cannot distinguish "no fact at this time" from
+# "your date format was unrecognized." Accept YYYY, YYYY-MM, YYYY-MM-DD.
+_ISO_DATE_RE = re.compile(r"^\d{4}(?:-(?:0[1-9]|1[0-2])(?:-(?:0[1-9]|[12]\d|3[01]))?)?$")
+
+
+def sanitize_iso_date(value, field_name: str = "date"):
+    """Validate an ISO-8601 date string, accepting None or empty as-is.
+
+    Accepts ``YYYY``, ``YYYY-MM``, or ``YYYY-MM-DD``. Raises ValueError
+    on any other non-empty input so the MCP layer can surface a clear
+    error to the caller instead of silently returning empty results.
+    """
+    if value is None or value == "":
+        return value
+    if not isinstance(value, str):
+        raise ValueError(f"{field_name} must be a string")
+    value = value.strip()
+    if not _ISO_DATE_RE.match(value):
+        raise ValueError(
+            f"{field_name}={value!r} is not a valid ISO-8601 date "
+            f"(expected YYYY, YYYY-MM, or YYYY-MM-DD)"
+        )
+    return value
+
+
 def sanitize_content(value: str, max_length: int = 100_000) -> str:
     """Validate drawer/diary content length."""
     if not isinstance(value, str) or not value.strip():
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 43897c8..8aecd05 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -55,6 +55,7 @@
     sanitize_kg_value,
     sanitize_name,
     sanitize_content,
+    sanitize_iso_date,
 )
 from .version import __version__  # noqa: E402
 from .backends.chroma import (  # noqa: E402
@@ -1021,6 +1022,7 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
     """Query the knowledge graph for an entity's relationships."""
     try:
         entity = sanitize_kg_value(entity, "entity")
+        as_of = sanitize_iso_date(as_of, "as_of")
     except ValueError as e:
         return {"error": str(e)}
     if direction not in ("outgoing", "incoming", "both"):
@@ -1037,6 +1039,7 @@ def tool_kg_add(
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
         object = sanitize_kg_value(object, "object")
+        valid_from = sanitize_iso_date(valid_from, "valid_from")
     except ValueError as e:
         return {"success": False, "error": str(e)}
 
@@ -1062,6 +1065,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
         object = sanitize_kg_value(object, "object")
+        ended = sanitize_iso_date(ended, "ended")
     except ValueError as e:
         return {"success": False, "error": str(e)}
     _wal_log(
diff --git a/tests/test_config.py b/tests/test_config.py
index d7707d9..f5064e2 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -3,7 +3,13 @@
 import tempfile
 
 import pytest
-from mempalace.config import MempalaceConfig, normalize_wing_name, sanitize_kg_value, sanitize_name
+from mempalace.config import (
+    MempalaceConfig,
+    normalize_wing_name,
+    sanitize_iso_date,
+    sanitize_kg_value,
+    sanitize_name,
+)
 
 
 def test_default_config():
@@ -212,3 +218,65 @@ def test_kg_value_rejects_null_bytes():
 def test_kg_value_rejects_over_length():
     with pytest.raises(ValueError):
         sanitize_kg_value("a" * 129)
+
+
+# --- sanitize_iso_date ---
+
+
+def test_iso_date_accepts_year_only():
+    assert sanitize_iso_date("2026") == "2026"
+
+
+def test_iso_date_accepts_year_month():
+    assert sanitize_iso_date("2026-03") == "2026-03"
+
+
+def test_iso_date_accepts_full_date():
+    assert sanitize_iso_date("2026-03-15") == "2026-03-15"
+
+
+def test_iso_date_passes_through_none():
+    assert sanitize_iso_date(None) is None
+
+
+def test_iso_date_passes_through_empty_string():
+    assert sanitize_iso_date("") == ""
+
+
+def test_iso_date_strips_whitespace():
+    assert sanitize_iso_date("  2026-03-15  ") == "2026-03-15"
+
+
+def test_iso_date_rejects_natural_language():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("March 2026")
+
+
+def test_iso_date_rejects_abbreviated_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("Jan 2025")
+
+
+def test_iso_date_rejects_us_format():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("03/15/2026")
+
+
+def test_iso_date_rejects_invalid_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-13")
+
+
+def test_iso_date_rejects_invalid_day():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-02-32")
+
+
+def test_iso_date_rejects_non_string():
+    with pytest.raises(ValueError):
+        sanitize_iso_date(20260315)
+
+
+def test_iso_date_error_names_field():
+    with pytest.raises(ValueError, match="valid_from"):
+        sanitize_iso_date("yesterday", "valid_from")
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 480b6bd..1b80f36 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -665,6 +665,52 @@ def test_kg_stats(self, monkeypatch, config, palace_path, seeded_kg):
         result = tool_kg_stats()
         assert result["entities"] >= 4
 
+    # --- Date validation at the MCP boundary (issue #1164) ---
+
+    def test_kg_add_rejects_invalid_valid_from(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="Alice",
+            predicate="likes",
+            object="coffee",
+            valid_from="Jan 2025",
+        )
+        assert result["success"] is False
+        assert "valid_from" in result["error"]
+        assert "ISO-8601" in result["error"]
+
+    def test_kg_query_rejects_invalid_as_of(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        result = tool_kg_query(entity="Max", as_of="March 2026")
+        assert "error" in result
+        assert "as_of" in result["error"]
+
+    def test_kg_invalidate_rejects_invalid_ended(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_invalidate
+
+        result = tool_kg_invalidate(
+            subject="Max",
+            predicate="does",
+            object="chess",
+            ended="yesterday",
+        )
+        assert result["success"] is False
+        assert "ended" in result["error"]
+
+    def test_kg_query_accepts_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_query
+
+        # YYYY and YYYY-MM are valid ISO-8601 forms — must not be rejected.
+        for value in ("2026", "2026-03", "2026-03-15"):
+            result = tool_kg_query(entity="Max", as_of=value)
+            assert "error" not in result, f"rejected valid date {value!r}: {result}"
+
 
 # ── Diary Tools ─────────────────────────────────────────────────────────
 

From abe85763d4da974b8652bdfe3654bee3a7534034 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Sun, 26 Apr 2026 12:50:43 +0200
Subject: [PATCH 008/127] fix(kg): reject partial ISO dates to avoid silent
 empty result sets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per qodo-ai review on PR #1167: sanitize_iso_date() previously accepted
YYYY and YYYY-MM, but KnowledgeGraph.query_entity() compares valid_from/
valid_to TEXT columns lexicographically against as_of. Lexicographic
comparison treats '2026-01-01' as greater than '2026' (because '-' >
end-of-string), so partial as_of values silently excluded valid facts —
re-introducing the silent-empty-results problem this PR was meant to
fix.

Tighten _ISO_DATE_RE to require YYYY-MM-DD only. Update docstring and
error message accordingly. Invert the two test cases that asserted
partials were accepted.
---
 mempalace/config.py      | 18 +++++++++++-------
 tests/test_config.py     | 12 ++++++++----
 tests/test_mcp_server.py | 15 +++++++++++----
 3 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index 4005779..2252a49 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -85,16 +85,21 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
 # (as_of, valid_from, valid_to, ended). Parameterized queries already
 # prevent SQL injection, but unvalidated date strings silently miss
 # every row — callers cannot distinguish "no fact at this time" from
-# "your date format was unrecognized." Accept YYYY, YYYY-MM, YYYY-MM-DD.
-_ISO_DATE_RE = re.compile(r"^\d{4}(?:-(?:0[1-9]|1[0-2])(?:-(?:0[1-9]|[12]\d|3[01]))?)?$")
+# "your date format was unrecognized." Require full YYYY-MM-DD: KG
+# queries compare TEXT dates lexicographically, so partials like "2026"
+# would re-introduce silent empty results (e.g. "2026-01-01" <= "2026"
+# is False), defeating the purpose of validation.
+_ISO_DATE_RE = re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$")
 
 
 def sanitize_iso_date(value, field_name: str = "date"):
     """Validate an ISO-8601 date string, accepting None or empty as-is.
 
-    Accepts ``YYYY``, ``YYYY-MM``, or ``YYYY-MM-DD``. Raises ValueError
-    on any other non-empty input so the MCP layer can surface a clear
-    error to the caller instead of silently returning empty results.
+    Accepts only ``YYYY-MM-DD``. Raises ValueError on any other
+    non-empty input so the MCP layer can surface a clear error to the
+    caller instead of silently returning empty results. Partial dates
+    (``YYYY``, ``YYYY-MM``) are rejected because KG queries compare
+    TEXT dates lexicographically and would silently exclude valid facts.
     """
     if value is None or value == "":
         return value
@@ -103,8 +108,7 @@ def sanitize_iso_date(value, field_name: str = "date"):
     value = value.strip()
     if not _ISO_DATE_RE.match(value):
         raise ValueError(
-            f"{field_name}={value!r} is not a valid ISO-8601 date "
-            f"(expected YYYY, YYYY-MM, or YYYY-MM-DD)"
+            f"{field_name}={value!r} is not a valid ISO-8601 date " f"(expected YYYY-MM-DD)"
         )
     return value
 
diff --git a/tests/test_config.py b/tests/test_config.py
index f5064e2..204faae 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -223,12 +223,16 @@ def test_kg_value_rejects_over_length():
 # --- sanitize_iso_date ---
 
 
-def test_iso_date_accepts_year_only():
-    assert sanitize_iso_date("2026") == "2026"
+def test_iso_date_rejects_year_only():
+    # Partial dates re-introduce silent empty result sets via lexicographic
+    # TEXT comparison in KG queries (e.g. "2026-01-01" <= "2026" is False).
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026")
 
 
-def test_iso_date_accepts_year_month():
-    assert sanitize_iso_date("2026-03") == "2026-03"
+def test_iso_date_rejects_year_month():
+    with pytest.raises(ValueError):
+        sanitize_iso_date("2026-03")
 
 
 def test_iso_date_accepts_full_date():
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 1b80f36..136b6f3 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -702,14 +702,21 @@ def test_kg_invalidate_rejects_invalid_ended(self, monkeypatch, config, palace_p
         assert result["success"] is False
         assert "ended" in result["error"]
 
-    def test_kg_query_accepts_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
+    def test_kg_query_rejects_partial_iso_dates(self, monkeypatch, config, palace_path, seeded_kg):
         _patch_mcp_server(monkeypatch, config, seeded_kg)
         from mempalace.mcp_server import tool_kg_query
 
-        # YYYY and YYYY-MM are valid ISO-8601 forms — must not be rejected.
-        for value in ("2026", "2026-03", "2026-03-15"):
+        # Partial ISO dates are rejected: KG queries compare TEXT dates
+        # lexicographically, so "2026-01-01" <= "2026" is False, which
+        # silently excludes facts. Reject at the boundary — only YYYY-MM-DD
+        # produces correct results.
+        for value in ("2026", "2026-03"):
             result = tool_kg_query(entity="Max", as_of=value)
-            assert "error" not in result, f"rejected valid date {value!r}: {result}"
+            assert "error" in result, f"accepted partial date {value!r}: {result}"
+
+        # Full ISO-8601 dates still pass.
+        result = tool_kg_query(entity="Max", as_of="2026-03-15")
+        assert "error" not in result, f"rejected valid date: {result}"
 
 
 # ── Diary Tools ─────────────────────────────────────────────────────────

From 10a0bc1a2b5004bd5866ffa9abaaf0f4520fd4a1 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 1 May 2026 08:55:01 +0000
Subject: [PATCH 009/127] chore(deps): bump actions/configure-pages from 5 to 6

Bumps [actions/configure-pages](https://github.com/actions/configure-pages) from 5 to 6.
- [Release notes](https://github.com/actions/configure-pages/releases)
- [Commits](https://github.com/actions/configure-pages/compare/v5...v6)

---
updated-dependencies:
- dependency-name: actions/configure-pages
  dependency-version: '6'
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/deploy-docs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
index b2b0d34..db10488 100644
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -29,7 +29,7 @@ jobs:
 
       - name: Configure GitHub Pages
         id: pages
-        uses: actions/configure-pages@v5
+        uses: actions/configure-pages@v6
 
       - uses: oven-sh/setup-bun@v2
         with:

From c3e1104e75d5310aeb2147c999fb19af8a9b1dcc Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:31:32 -0600
Subject: [PATCH 010/127] fix(chroma): harden HNSW startup preflight

---
 mempalace/backends/chroma.py | 131 ++++++++++++++--
 tests/test_backends.py       | 283 ++++++++++++++++++++++++++++++++++-
 2 files changed, 395 insertions(+), 19 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 01ac627..646969b 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -3,7 +3,9 @@
 import datetime as _dt
 import logging
 import os
+import pickle
 import sqlite3
+from numbers import Integral
 from pathlib import Path
 from typing import Any, Optional
 
@@ -591,6 +593,97 @@ def _pin_hnsw_threads(collection) -> None:
 _BLOB_FIX_MARKER = ".blob_seq_ids_migrated"
 
 
+def _valid_dimensionality(value: object) -> bool:
+    return isinstance(value, Integral) and not isinstance(value, bool) and int(value) > 0
+
+
+def _persisted_metadata_fields(obj: object) -> tuple[object, object]:
+    if isinstance(obj, dict):
+        return obj.get("dimensionality"), obj.get("id_to_label")
+    return getattr(obj, "dimensionality", None), getattr(obj, "id_to_label", None)
+
+
+def quarantine_invalid_hnsw_metadata(palace_path: str) -> list[str]:
+    """Quarantine segment dirs whose ``index_metadata.pickle`` is unreadable or invalid.
+
+    Chroma's persisted HNSW metadata is untrusted disk state. If a segment has
+    labels but no valid positive dimensionality, current Chroma versions can
+    accept the pickle and crash later in the Rust loader. We rename the entire
+    segment out of the way before ``PersistentClient`` opens so Chroma can
+    rebuild cleanly instead of touching known-bad metadata.
+    """
+    try:
+        entries = os.listdir(palace_path)
+    except OSError:
+        return []
+
+    moved: list[str] = []
+    for name in entries:
+        if "-" not in name or name.startswith(".") or ".drift-" in name or ".corrupt-" in name:
+            continue
+        seg_dir = os.path.join(palace_path, name)
+        if not os.path.isdir(seg_dir):
+            continue
+
+        meta_path = os.path.join(seg_dir, "index_metadata.pickle")
+        if not os.path.isfile(meta_path):
+            continue
+
+        reason = None
+        try:
+            persisted = _SafePersistentDataUnpickler.load(meta_path)
+        except (EOFError, OSError):
+            logger.debug(
+                "Skipping invalid-HNSW quarantine for transient metadata read in %s",
+                meta_path,
+                exc_info=True,
+            )
+            continue
+        except pickle.UnpicklingError as exc:
+            if "truncated" in str(exc).lower() or "ran out of input" in str(exc).lower():
+                logger.debug(
+                    "Skipping invalid-HNSW quarantine for transient metadata read in %s",
+                    meta_path,
+                    exc_info=True,
+                )
+                continue
+            reason = f"invalid index_metadata.pickle: {exc}"
+        except Exception as exc:
+            reason = f"invalid index_metadata.pickle: {exc}"
+        else:
+            if not isinstance(persisted, dict) and not (
+                hasattr(persisted, "dimensionality") or hasattr(persisted, "id_to_label")
+            ):
+                reason = f"unrecognized index_metadata.pickle payload: {type(persisted).__name__}"
+            else:
+                dimensionality, id_to_label = _persisted_metadata_fields(persisted)
+                if id_to_label is not None and not isinstance(id_to_label, dict):
+                    reason = f"invalid id_to_label type {type(id_to_label).__name__}"
+                else:
+                    has_labels = bool(id_to_label)
+                    if has_labels and not _valid_dimensionality(dimensionality):
+                        reason = (
+                            "labels present but dimensionality is missing or invalid "
+                            f"({dimensionality!r})"
+                        )
+                    elif dimensionality is not None and not _valid_dimensionality(dimensionality):
+                        reason = f"invalid dimensionality {dimensionality!r}"
+
+        if reason is None:
+            continue
+
+        stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
+        target = f"{seg_dir}.corrupt-{stamp}"
+        try:
+            os.rename(seg_dir, target)
+            moved.append(target)
+            logger.warning("Quarantined invalid HNSW metadata in %s: %s", seg_dir, reason)
+        except OSError:
+            logger.exception("Failed to quarantine invalid HNSW metadata in %s", seg_dir)
+
+    return moved
+
+
 def _fix_blob_seq_ids(palace_path: str) -> None:
     """Fix ChromaDB 0.6.x -> 1.5.x migration bug: BLOB seq_ids -> INTEGER.
 
@@ -994,6 +1087,12 @@ def _client(self, palace_path: str):
 
         if cached is None or inode_changed or mtime_changed or mtime_appeared:
             _fix_blob_seq_ids(palace_path)
+            if inode_changed:
+                ChromaBackend._quarantined_paths.discard(palace_path)
+            if palace_path not in ChromaBackend._quarantined_paths:
+                quarantine_invalid_hnsw_metadata(palace_path)
+                quarantine_stale_hnsw(palace_path)
+                ChromaBackend._quarantined_paths.add(palace_path)
             cached = chromadb.PersistentClient(path=palace_path)
             self._clients[palace_path] = cached
             # Re-stat after the client constructor runs: chromadb creates
@@ -1006,26 +1105,27 @@ def _client(self, palace_path: str):
     # Public static helpers (legacy; prefer :meth:`get_collection`)
     # ------------------------------------------------------------------
 
-    # Per-process record of palaces that have already had quarantine_stale_hnsw
-    # invoked at least once. The proactive drift check is a *cold-start*
-    # protection — it catches HNSW segments that arrived stale relative to
-    # ``chroma.sqlite3`` (e.g. cross-machine replication, partial restore,
-    # crashed-mid-write). Once a long-running process has opened the palace
-    # cleanly, re-firing on every reconnect is a *runtime thrash*: the
-    # daemon's own writes bump sqlite mtime but HNSW flushes batch on
-    # chromadb's internal cadence, so the mtime gap naturally exceeds the
-    # threshold under steady write load even though nothing is corrupt.
+    # Per-process record of palaces that have already had the cold-start
+    # quarantine invoked at least once. The proactive HNSW checks are a
+    # *cold-start* protection — they catch segments that arrive stale relative
+    # to ``chroma.sqlite3`` or invalid on disk (e.g. cross-machine replication,
+    # partial restore, crashed-mid-write). Once a long-running process has
+    # opened the palace cleanly, re-firing the stale check on every reconnect
+    # is a *runtime thrash*: the daemon's own writes bump sqlite mtime but HNSW
+    # flushes batch on chromadb's internal cadence, so the mtime gap naturally
+    # exceeds the threshold under steady write load even though nothing is
+    # corrupt.
     # Real runtime drift is still handled — palace-daemon's ``_auto_repair``
     # calls :func:`quarantine_stale_hnsw` directly on observed HNSW errors,
     # which bypasses this gate.
     #
     # Thread-safety: this set is mutated without a lock. Two concurrent
     # ``make_client()`` calls for the same palace can both pass the
-    # membership check and both invoke ``quarantine_stale_hnsw``. That's
-    # safe because the function is idempotent (mtime check + timestamped
-    # rename of distinct directories), so the worst-case race produces
-    # one redundant rename attempt that no-ops. Idempotency is the
-    # safety property; locking would add cost without correctness gain.
+    # membership check and both invoke the cold-start quarantine. That's
+    # safe because the functions are idempotent (mtime checks + timestamped
+    # rename of distinct directories), so the worst-case race produces one
+    # redundant rename attempt that no-ops. Idempotency is the safety
+    # property; locking would add cost without correctness gain.
     _quarantined_paths: set[str] = set()
 
     @staticmethod
@@ -1036,12 +1136,13 @@ def make_client(palace_path: str):
         own client cache. New code should obtain a collection through
         :meth:`get_collection` which manages caching internally.
 
-        Quarantines stale HNSW segments **once per palace per process**. See
+        Quarantines HNSW segments **once per palace per process**. See
         :attr:`_quarantined_paths` for the rationale (cold-start protection
         vs. runtime thrash on steady-write daemons).
         """
         _fix_blob_seq_ids(palace_path)
         if palace_path not in ChromaBackend._quarantined_paths:
+            quarantine_invalid_hnsw_metadata(palace_path)
             quarantine_stale_hnsw(palace_path)
             ChromaBackend._quarantined_paths.add(palace_path)
         return chromadb.PersistentClient(path=palace_path)
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 5efa71b..cbbcdef 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1,4 +1,5 @@
 import os
+import pickle
 import sqlite3
 from pathlib import Path
 
@@ -18,6 +19,7 @@
     ChromaCollection,
     _fix_blob_seq_ids,
     _pin_hnsw_threads,
+    quarantine_invalid_hnsw_metadata,
     quarantine_stale_hnsw,
 )
 
@@ -708,7 +710,10 @@ def test_make_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeyp
     """Quarantine fires on first ``make_client()`` for a palace, then is
     skipped on subsequent calls — prevents runtime thrash where a daemon's
     own steady writes bump ``chroma.sqlite3`` faster than HNSW flushes,
-    making the mtime heuristic falsely trigger every reconnect."""
+    making the mtime heuristic falsely trigger every reconnect.
+
+    Invalid metadata quarantine shares the same cold-start gate here; the
+    more aggressive refresh path lives in ``_client()``."""
     from mempalace.backends.chroma import ChromaBackend
 
     palace_path = str(tmp_path / "palace")
@@ -730,9 +735,37 @@ def _spy(path, stale_seconds=300.0):
     ChromaBackend.make_client(palace_path)
     ChromaBackend.make_client(palace_path)
 
-    assert calls == [
-        palace_path
-    ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+    assert calls == [palace_path], (
+        "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+    )
+
+
+def test_make_client_gates_invalid_metadata_on_first_call(tmp_path, monkeypatch):
+    """Invalid metadata quarantine is gated on the first make_client() call."""
+    from mempalace.backends.chroma import ChromaBackend
+
+    palace_path = str(tmp_path / "palace")
+    os.makedirs(palace_path, exist_ok=True)
+    (Path(palace_path) / "chroma.sqlite3").write_text("")
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    calls: list[str] = []
+
+    def _invalid(path, *args, **kwargs):
+        calls.append(path)
+        return []
+
+    def _stale(path, stale_seconds=300.0):
+        return []
+
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _invalid)
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _stale)
+
+    ChromaBackend.make_client(palace_path)
+    ChromaBackend.make_client(palace_path)
+
+    assert calls == [palace_path]
 
 
 def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
@@ -811,3 +844,245 @@ def test_get_collection_applies_retrofit_on_existing_palace(tmp_path):
     )
 
     assert wrapper._collection.configuration_json["hnsw"]["num_threads"] == 1
+
+
+def test_quarantine_invalid_hnsw_metadata_renames_missing_dimensionality(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": None, "id_to_label": {"a": 1}}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_allows_uninitialized_segment(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": None, "id_to_label": {}}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_non_dict_id_to_label(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump({"dimensionality": 8, "id_to_label": ["a", "b"]}, f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_non_schema_payload(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump(["not", "a", "metadata", "object"], f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def _dangerous_pickle_payload_executed():
+    raise AssertionError("unsafe pickle payload executed")
+
+
+class _DangerousPickle:
+    def __reduce__(self):
+        return (_dangerous_pickle_payload_executed, ())
+
+
+def test_quarantine_invalid_hnsw_metadata_rejects_unsafe_pickle(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    with open(seg / "index_metadata.pickle", "wb") as f:
+        pickle.dump(_DangerousPickle(), f)
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert len(moved) == 1
+    assert ".corrupt-" in moved[0]
+    assert not seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_skips_transient_read_errors(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    meta = seg / "index_metadata.pickle"
+    meta.write_bytes(b"partial")
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma._SafePersistentDataUnpickler.load",
+        lambda path: (_ for _ in ()).throw(EOFError("flush in progress")),
+    )
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_quarantine_invalid_hnsw_metadata_skips_truncated_pickle(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    seg = palace / "abcd-1234-5678"
+    seg.mkdir()
+    meta = seg / "index_metadata.pickle"
+    meta.write_bytes(b"partial")
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma._SafePersistentDataUnpickler.load",
+        lambda path: (_ for _ in ()).throw(pickle.UnpicklingError("pickle data was truncated")),
+    )
+
+    moved = quarantine_invalid_hnsw_metadata(str(palace))
+
+    assert moved == []
+    assert seg.exists()
+
+
+def test_chroma_backend_preflights_metadata_before_persistent_client(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+    ]
+
+
+def test_chroma_backend_stale_quarantine_is_cold_start_only_on_refresh(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "chroma.sqlite3").write_text("")
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    stats = iter([(1, 1.0), (1, 1.0), (1, 2.0), (1, 2.0)])
+    monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
+
+    backend._client(str(palace))
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+        ("blob", str(palace)),
+    ]
+
+
+def test_chroma_backend_requarantines_after_inode_replacement(tmp_path, monkeypatch):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "chroma.sqlite3").write_text("")
+    calls = []
+
+    def _record(name):
+        def inner(path, *args, **kwargs):
+            calls.append((name, path))
+            return [] if name != "blob" else None
+
+        return inner
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+    monkeypatch.setattr("mempalace.backends.chroma._fix_blob_seq_ids", _record("blob"))
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.quarantine_invalid_hnsw_metadata", _record("invalid")
+    )
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _record("stale"))
+
+    class DummyClient:
+        pass
+
+    monkeypatch.setattr(
+        "mempalace.backends.chroma.chromadb.PersistentClient", lambda path: DummyClient()
+    )
+
+    backend = ChromaBackend()
+    stats = iter([(1, 1.0), (1, 1.0), (2, 2.0), (2, 2.0)])
+    monkeypatch.setattr(backend, "_db_stat", lambda path: next(stats))
+
+    backend._client(str(palace))
+    backend._client(str(palace))
+
+    assert calls == [
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+        ("blob", str(palace)),
+        ("invalid", str(palace)),
+        ("stale", str(palace)),
+    ]

From 7fa27bd23101f0c7dc4fa33f84be0b1d9710dbe2 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:31:32 -0600
Subject: [PATCH 011/127] fix(repair): rebuild collections through temp staging

---
 mempalace/repair.py  | 159 +++++++++++++++++-----
 tests/test_repair.py | 314 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 436 insertions(+), 37 deletions(-)

diff --git a/mempalace/repair.py b/mempalace/repair.py
index 1cd1556..49d6abe 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -37,10 +37,13 @@
 from datetime import datetime
 from typing import Optional
 
+from chromadb.errors import NotFoundError as ChromaNotFoundError
+
 from .backends.chroma import ChromaBackend, hnsw_capacity_status
 
 
 COLLECTION_NAME = "mempalace_drawers"
+REPAIR_TEMP_COLLECTION = f"{COLLECTION_NAME}__repair_tmp"
 
 
 def _get_palace_path():
@@ -83,6 +86,108 @@ def _paginate_ids(col, where=None):
     return ids
 
 
+def _extract_drawers(col, total: int, batch_size: int):
+    all_ids = []
+    all_docs = []
+    all_metas = []
+    offset = 0
+    while offset < total:
+        batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
+        if not batch["ids"]:
+            break
+        all_ids.extend(batch["ids"])
+        all_docs.extend(batch["documents"])
+        all_metas.extend(batch["metadatas"])
+        offset += len(batch["ids"])
+    return all_ids, all_docs, all_metas
+
+
+def _verify_collection_count(col, expected: int, label: str) -> None:
+    actual = col.count()
+    if actual != expected:
+        raise RuntimeError(f"{label} count mismatch: expected {expected}, got {actual}")
+
+
+def _is_missing_collection_value_error(exc: ValueError) -> bool:
+    message = str(exc).lower()
+    return "does not exist" in message or "not found" in message
+
+
+def _delete_collection_if_exists(backend, palace_path: str, collection_name: str) -> None:
+    try:
+        backend.delete_collection(palace_path, collection_name)
+    except ValueError as exc:
+        if _is_missing_collection_value_error(exc):
+            return
+        raise
+    except (FileNotFoundError, ChromaNotFoundError):
+        return
+
+
+class RebuildCollectionError(RuntimeError):
+    """Raised when temp rebuild fails, carrying whether the live swap happened."""
+
+    def __init__(self, message: str, *, live_replaced: bool):
+        super().__init__(message)
+        self.live_replaced = live_replaced
+
+
+def _rebuild_collection_via_temp(
+    backend,
+    palace_path: str,
+    all_ids,
+    all_docs,
+    all_metas,
+    batch_size: int,
+    progress=print,
+) -> int:
+    expected = len(all_ids)
+    temp_name = REPAIR_TEMP_COLLECTION
+    live_replaced = False
+
+    try:
+        _delete_collection_if_exists(backend, palace_path, temp_name)
+
+        progress(f"  Building temporary collection: {temp_name}")
+        temp_col = backend.create_collection(palace_path, temp_name)
+        staged = 0
+        for i in range(0, expected, batch_size):
+            batch_ids = all_ids[i : i + batch_size]
+            batch_docs = all_docs[i : i + batch_size]
+            batch_metas = all_metas[i : i + batch_size]
+            temp_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
+            staged += len(batch_ids)
+            progress(f"  Staged {staged}/{expected} drawers...")
+        _verify_collection_count(temp_col, expected, "temporary rebuild")
+
+        progress("  Rebuilding live collection...")
+        backend.delete_collection(palace_path, COLLECTION_NAME)
+        live_replaced = True
+        new_col = backend.create_collection(palace_path, COLLECTION_NAME)
+
+        rebuilt = 0
+        for i in range(0, expected, batch_size):
+            batch_ids = all_ids[i : i + batch_size]
+            batch_docs = all_docs[i : i + batch_size]
+            batch_metas = all_metas[i : i + batch_size]
+            new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
+            rebuilt += len(batch_ids)
+            progress(f"  Re-filed {rebuilt}/{expected} drawers...")
+        _verify_collection_count(new_col, expected, "rebuilt live collection")
+
+        try:
+            _delete_collection_if_exists(backend, palace_path, temp_name)
+        except Exception:
+            pass
+        return rebuilt
+    except Exception as exc:
+        try:
+            _delete_collection_if_exists(backend, palace_path, temp_name)
+        except Exception:
+            pass
+        raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc
+
+
 def scan_palace(palace_path=None, only_wing=None):
     """Scan the palace for corrupt/unfetchable IDs.
 
@@ -373,18 +478,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     # Extract all drawers in batches
     print("\n  Extracting drawers...")
     batch_size = 5000
-    all_ids = []
-    all_docs = []
-    all_metas = []
-    offset = 0
-    while offset < total:
-        batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
-        if not batch["ids"]:
-            break
-        all_ids.extend(batch["ids"])
-        all_docs.extend(batch["documents"])
-        all_metas.extend(batch["metadatas"])
-        offset += len(batch["ids"])
+    all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
     print(f"  Extracted {len(all_ids)} drawers")
 
     # ── #1208 guard ──────────────────────────────────────────────────
@@ -407,28 +501,33 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
 
     # Rebuild with correct HNSW settings
     print("  Rebuilding collection with hnsw:space=cosine...")
-    backend.delete_collection(palace_path, COLLECTION_NAME)
-    new_col = backend.create_collection(palace_path, COLLECTION_NAME)
-
-    filed = 0
     try:
-        for i in range(0, len(all_ids), batch_size):
-            batch_ids = all_ids[i : i + batch_size]
-            batch_docs = all_docs[i : i + batch_size]
-            batch_metas = all_metas[i : i + batch_size]
-            new_col.upsert(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
-            filed += len(batch_ids)
-            print(f"  Re-filed {filed}/{len(all_ids)} drawers...")
-    except Exception as e:
+        filed = _rebuild_collection_via_temp(
+            backend,
+            palace_path,
+            all_ids,
+            all_docs,
+            all_metas,
+            batch_size,
+            progress=print,
+        )
+    except RebuildCollectionError as e:
         print(f"\n  ERROR during rebuild: {e}")
-        print(f"  Only {filed}/{len(all_ids)} drawers were re-filed.")
-        if os.path.exists(backup_path):
+        print("  Rebuild aborted before completion.")
+        if e.live_replaced and os.path.exists(backup_path):
             print(f"  Restoring from backup: {backup_path}")
-            backend.delete_collection(palace_path, COLLECTION_NAME)
-            shutil.copy2(backup_path, sqlite_path)
-            print("  Backup restored. Palace is back to pre-repair state.")
-        else:
+            try:
+                _close_chroma_handles(palace_path)
+                _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
+                shutil.copy2(backup_path, sqlite_path)
+                print("  Backup restored. Palace is back to pre-repair state.")
+            except Exception as restore_error:
+                print(f"  Backup restore failed: {restore_error}")
+                print(f"  Manual restore required from: {backup_path}")
+        elif e.live_replaced:
             print("  No backup available. Re-mine from source files to recover.")
+        else:
+            print("  Live collection was not replaced; leaving the original palace untouched.")
         raise
 
     print(f"\n  Repair complete. {filed} drawers rebuilt.")
diff --git a/tests/test_repair.py b/tests/test_repair.py
index bc770dd..33daad9 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -2,7 +2,7 @@
 
 import os
 import sqlite3
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock, call, patch
 
 import pytest
 
@@ -229,8 +229,11 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
     }
 
     mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
     mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
-    mock_backend.create_collection.return_value = mock_new_col
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
 
     repair.rebuild_index(palace_path=str(tmp_path))
 
@@ -239,14 +242,74 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
     assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
 
     # Verify: deleted and recreated (cosine is the backend default)
-    mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
-    mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
 
     # Verify: used upsert not add
+    mock_temp_col.upsert.assert_called_once()
     mock_new_col.upsert.assert_called_once()
     mock_new_col.add.assert_not_called()
 
 
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_ignores_missing_temp_collection_at_start(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    mock_backend.delete_collection.side_effect = [
+        ValueError("Collection [mempalace_drawers__repair_tmp] does not exist"),
+        None,
+        None,
+    ]
+
+    repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
+def test_delete_collection_if_exists_reraises_unexpected_value_error():
+    mock_backend = MagicMock()
+    mock_backend.delete_collection.side_effect = ValueError("invalid collection name")
+
+    with pytest.raises(ValueError, match="invalid collection name"):
+        repair._delete_collection_if_exists(mock_backend, "/palace", "bad/name")
+
+
 @patch("mempalace.repair.shutil")
 @patch("mempalace.repair.ChromaBackend")
 def test_rebuild_index_error_reading(mock_backend_cls, mock_shutil, tmp_path):
@@ -365,19 +428,256 @@ def test_rebuild_index_proceeds_with_override(mock_backend_cls, mock_shutil, tmp
         },
         {"ids": [], "documents": [], "metadatas": []},
     ]
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 10_000
     mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 10_000
     mock_backend.get_collection.return_value = mock_col
-    mock_backend.create_collection.return_value = mock_new_col
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
     mock_backend_cls.return_value = mock_backend
 
     with patch("mempalace.repair.sqlite_drawer_count", return_value=67_580):
         repair.rebuild_index(palace_path=str(tmp_path), confirm_truncation_ok=True)
 
-    mock_backend.delete_collection.assert_called_once()
-    mock_backend.create_collection.assert_called_once()
+    assert mock_backend.delete_collection.call_count == 3
+    assert mock_backend.create_collection.call_count == 2
+    mock_temp_col.upsert.assert_called()
     mock_new_col.upsert.assert_called()
 
 
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_stage_failure_leaves_live_collection_untouched(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 1
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.return_value = mock_temp_col
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is False
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_live_failure_restores_backup(mock_backend_cls, mock_shutil, tmp_path):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is True
+    assert mock_shutil.copy2.call_count == 2
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_live_delete_missing_still_restores_backup(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("create failed")]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        None,
+        repair.ChromaNotFoundError("missing"),
+    ]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert excinfo.value.live_replaced is True
+    assert mock_shutil.copy2.call_count == 2
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_restore_failure_preserves_original_error(
+    mock_backend_cls, mock_shutil, tmp_path, capsys
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _copy2_side_effect(src, dst):
+        if str(src).endswith(".backup"):
+            raise PermissionError("locked sqlite")
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _copy2_side_effect
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    out = capsys.readouterr().out
+    assert "locked sqlite" in out
+    assert "Manual restore required" in out
+    assert "live upsert failed" in str(excinfo.value)
+
+
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_collection_via_temp_keeps_original_error_when_cleanup_fails(
+    mock_backend_cls,
+):
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        RuntimeError("cleanup failed"),
+    ]
+
+    with pytest.raises(repair.RebuildCollectionError) as excinfo:
+        repair._rebuild_collection_via_temp(
+            mock_backend,
+            "/palace",
+            ["id1", "id2"],
+            ["doc1", "doc2"],
+            [{"wing": "a"}, {"wing": "b"}],
+            batch_size=5000,
+            progress=lambda *args, **kwargs: None,
+        )
+
+    assert "live build failed" in str(excinfo.value)
+    assert excinfo.value.live_replaced is True
+    assert mock_backend.delete_collection.call_args_list == [
+        call("/palace", "mempalace_drawers__repair_tmp"),
+        call("/palace", "mempalace_drawers"),
+        call("/palace", "mempalace_drawers__repair_tmp"),
+    ]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_ignores_temp_cleanup_failure_after_success(
+    mock_backend_cls, mock_shutil, tmp_path
+):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+
+    def _fake_copy2(src, dst):
+        with open(dst, "w") as handle:
+            handle.write("backup")
+
+    mock_shutil.copy2.side_effect = _fake_copy2
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    mock_backend.delete_collection.side_effect = [
+        None,
+        None,
+        RuntimeError("cleanup failed"),
+    ]
+
+    repair.rebuild_index(palace_path=str(tmp_path))
+
+    assert mock_shutil.copy2.call_count == 1
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+        call(str(tmp_path), "mempalace_drawers"),
+        call(str(tmp_path), "mempalace_drawers__repair_tmp"),
+    ]
+
+
 # ── repair_max_seq_id ─────────────────────────────────────────────────
 
 
From 2f509b4789d7ae44f8fbf1281df812e098207aa3 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:31:32 -0600
Subject: [PATCH 012/127] fix(cli): restore backup on repair failure

---
 mempalace/cli.py  | 62 +++++++++++++++++++++++++++--------------------
 tests/test_cli.py | 50 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 27 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index ca9798b..27c81d3 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -648,7 +648,14 @@ def cmd_repair(args):
     import shutil
     from .backends.chroma import ChromaBackend
     from .migrate import confirm_destructive_action, contains_palace_database
-    from .repair import TruncationDetected, check_extraction_safety
+    from .repair import (
+        RebuildCollectionError,
+        TruncationDetected,
+        _close_chroma_handles,
+        _extract_drawers,
+        _rebuild_collection_via_temp,
+        check_extraction_safety,
+    )
 
     palace_path = os.path.abspath(
         os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -705,18 +712,7 @@ def cmd_repair(args):
     # Extract all drawers in batches
     print("\n  Extracting drawers...")
     batch_size = 5000
-    all_ids = []
-    all_docs = []
-    all_metas = []
-    offset = 0
-    while offset < total:
-        batch = col.get(limit=batch_size, offset=offset, include=["documents", "metadatas"])
-        if not batch["ids"]:
-            break
-        all_ids.extend(batch["ids"])
-        all_docs.extend(batch["documents"])
-        all_metas.extend(batch["metadatas"])
-        offset += len(batch["ids"])
+    all_ids, all_docs, all_metas = _extract_drawers(col, total, batch_size)
     print(f"  Extracted {len(all_ids)} drawers")
 
     # ── #1208 guard ──────────────────────────────────────────────────
@@ -736,7 +732,6 @@ def cmd_repair(args):
         print(e.message)
         return
 
-    # Backup and rebuild
     palace_path = os.path.normpath(palace_path)
     backup_path = palace_path + ".backup"
     if os.path.exists(backup_path):
@@ -750,18 +745,33 @@ def cmd_repair(args):
     print(f"  Backing up to {backup_path}...")
     shutil.copytree(palace_path, backup_path)
 
-    print("  Rebuilding collection...")
-    backend.delete_collection(palace_path, "mempalace_drawers")
-    new_col = backend.create_collection(palace_path, "mempalace_drawers")
-
-    filed = 0
-    for i in range(0, len(all_ids), batch_size):
-        batch_ids = all_ids[i : i + batch_size]
-        batch_docs = all_docs[i : i + batch_size]
-        batch_metas = all_metas[i : i + batch_size]
-        new_col.add(documents=batch_docs, ids=batch_ids, metadatas=batch_metas)
-        filed += len(batch_ids)
-        print(f"  Re-filed {filed}/{len(all_ids)} drawers...")
+    try:
+        filed = _rebuild_collection_via_temp(
+            backend,
+            palace_path,
+            all_ids,
+            all_docs,
+            all_metas,
+            batch_size,
+            progress=print,
+        )
+    except RebuildCollectionError as e:
+        print(f"  Repair failed: {e}")
+        if getattr(e, "live_replaced", False):
+            print("  Live collection was already replaced; restoring from backup...")
+            try:
+                _close_chroma_handles(palace_path)
+                if os.path.exists(palace_path):
+                    shutil.rmtree(palace_path)
+                shutil.copytree(backup_path, palace_path)
+                print(f"  Restore complete from backup: {backup_path}")
+            except Exception as restore_error:
+                print(f"  Automatic restore failed: {restore_error}")
+                print("  Manual recovery required:")
+                print(f"    1. Remove or rename the broken directory: {palace_path}")
+                print(f"    2. Restore the backup directory to: {palace_path}")
+                print(f"       Backup location: {backup_path}")
+        sys.exit(1)
 
     print(f"\n  Repair complete. {filed} drawers rebuilt.")
     print(f"  Backup saved at {backup_path}")
diff --git a/tests/test_cli.py b/tests/test_cli.py
index af7b39d..11845fe 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -4,7 +4,7 @@
 import shlex
 import sys
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock, call, patch
 
 import pytest
 
@@ -760,13 +760,61 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
         "documents": ["doc1", "doc2"],
         "metadatas": [{"wing": "a"}, {"wing": "b"}],
     }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
     mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
     mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
     with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
         cmd_repair(args)
     out = capsys.readouterr().out
     assert "Repair complete" in out
     assert "2 drawers rebuilt" in out
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+        call(str(palace_dir), "mempalace_drawers"),
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+    ]
+    mock_temp_col.upsert.assert_called_once()
+    mock_new_col.upsert.assert_called_once()
+    mock_new_col.add.assert_not_called()
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+    args = argparse.Namespace(palace=None, yes=True)
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_backend = _mock_backend_for(col=mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
+    with (
+        patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend),
+        patch("mempalace.repair._close_chroma_handles") as mock_close_handles,
+    ):
+        with pytest.raises(SystemExit) as excinfo:
+            cmd_repair(args)
+    out = capsys.readouterr().out
+    assert excinfo.value.code == 1
+    assert "Repair failed" in out
+    assert "restoring from backup" in out
+    mock_close_handles.assert_called_once_with(str(palace_dir))
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+        call(str(palace_dir), "mempalace_drawers"),
+        call(str(palace_dir), "mempalace_drawers__repair_tmp"),
+    ]
 
 
 @patch("mempalace.cli.MempalaceConfig")

From f57f30025f2551efc3d917a368f5ed0dae336c46 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 09:57:39 -0600
Subject: [PATCH 013/127] fix(repair): close active backend before rollback
 restore

Rollback cleanup was instantiating a fresh ChromaBackend, so the live backend that had opened the PersistentClient could keep file handles alive during restore. Close the active backend instance instead so rollback and CLI recovery can release Windows-safe locks before copying the backup back into place.
---
 mempalace/cli.py     |  2 +-
 mempalace/repair.py  | 14 ++++++++++----
 tests/test_cli.py    |  7 ++-----
 tests/test_repair.py | 11 ++++++++---
 4 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 27c81d3..9a1e8e4 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -760,7 +760,7 @@ def cmd_repair(args):
         if getattr(e, "live_replaced", False):
             print("  Live collection was already replaced; restoring from backup...")
             try:
-                _close_chroma_handles(palace_path)
+                _close_chroma_handles(palace_path, backend=backend)
                 if os.path.exists(palace_path):
                     shutil.rmtree(palace_path)
                 shutil.copytree(backup_path, palace_path)
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 49d6abe..0585405 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -517,7 +517,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
         if e.live_replaced and os.path.exists(backup_path):
             print(f"  Restoring from backup: {backup_path}")
             try:
-                _close_chroma_handles(palace_path)
+                _close_chroma_handles(palace_path, backend=backend)
                 _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
                 shutil.copy2(backup_path, sqlite_path)
                 print("  Backup restored. Palace is back to pre-repair state.")
@@ -593,12 +593,18 @@ def status(palace_path=None) -> dict:
 # ---------------------------------------------------------------------------
 
 
-def _close_chroma_handles(palace_path: str) -> None:
-    """Drop ChromaBackend + chromadb singleton caches so OS mmap handles release."""
+def _close_chroma_handles(palace_path: str, backend: ChromaBackend | None = None) -> None:
+    """Drop ChromaBackend + chromadb singleton caches so OS mmap handles release.
+
+    When ``backend`` is provided, close the live instance so rollback/restore
+    releases the handles it was already using. Otherwise fall back to a
+    transient backend instance for the max-seq-id repair path.
+    """
     import gc
 
     try:
-        ChromaBackend().close_palace(palace_path)
+        closer = backend if backend is not None else ChromaBackend()
+        closer.close_palace(palace_path)
     except Exception:
         pass
     try:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 11845fe..6572f1d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -799,17 +799,14 @@ def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp
     mock_temp_col.count.return_value = 2
     mock_backend = _mock_backend_for(col=mock_col)
     mock_backend.create_collection.side_effect = [mock_temp_col, RuntimeError("live build failed")]
-    with (
-        patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend),
-        patch("mempalace.repair._close_chroma_handles") as mock_close_handles,
-    ):
+    with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
         with pytest.raises(SystemExit) as excinfo:
             cmd_repair(args)
     out = capsys.readouterr().out
     assert excinfo.value.code == 1
     assert "Repair failed" in out
     assert "restoring from backup" in out
-    mock_close_handles.assert_called_once_with(str(palace_dir))
+    mock_backend.close_palace.assert_called_once_with(str(palace_dir))
     assert mock_backend.delete_collection.call_args_list == [
         call(str(palace_dir), "mempalace_drawers__repair_tmp"),
         call(str(palace_dir), "mempalace_drawers"),
diff --git a/tests/test_repair.py b/tests/test_repair.py
index 33daad9..9cd12dd 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -499,20 +499,25 @@ def _fake_copy2(src, dst):
     mock_temp_col.count.return_value = 2
     mock_new_col = MagicMock()
     mock_new_col.upsert.side_effect = RuntimeError("live upsert failed")
-    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
-    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    active_backend = MagicMock()
+    active_backend.get_collection.return_value = mock_col
+    active_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+    helper_backend = MagicMock()
+    mock_backend_cls.side_effect = [active_backend, helper_backend]
 
     with pytest.raises(repair.RebuildCollectionError) as excinfo:
         repair.rebuild_index(palace_path=str(tmp_path))
 
     assert excinfo.value.live_replaced is True
     assert mock_shutil.copy2.call_count == 2
-    assert mock_backend.delete_collection.call_args_list == [
+    assert active_backend.delete_collection.call_args_list == [
         call(str(tmp_path), "mempalace_drawers__repair_tmp"),
         call(str(tmp_path), "mempalace_drawers"),
         call(str(tmp_path), "mempalace_drawers__repair_tmp"),
         call(str(tmp_path), "mempalace_drawers"),
     ]
+    active_backend.close_palace.assert_called_once_with(str(tmp_path))
+    helper_backend.close_palace.assert_not_called()
 
 
 @patch("mempalace.repair.shutil")

From 0e32b9643c3d4ceca36b6f610ddbcbbf58ad8dd7 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Thu, 30 Apr 2026 21:47:08 -0600
Subject: [PATCH 014/127] fix: avoid false hnsw divergence fallback

---
 mempalace/backends/chroma.py |  27 +++---
 mempalace/searcher.py        |  43 ++++++++--
 tests/test_hnsw_capacity.py  | 160 ++++++++++++++++++++++++++++++++++-
 3 files changed, 204 insertions(+), 26 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 646969b..7ce8fd8 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -491,22 +491,17 @@ def hnsw_capacity_status(palace_path: str, collection_name: str = "mempalace_dra
         divergence_floor = max(_HNSW_DIVERGENCE_FALLBACK_FLOOR, 2 * sync_threshold)
 
         if hnsw_count is None:
-            # No pickle yet — segment hasn't persisted metadata. Could be
-            # fresh-but-unflushed (normal) or interrupted-mid-flush (bad).
-            # We can't distinguish without the pickle, so only flag
-            # divergence when sqlite holds clearly more than two flush
-            # windows worth — same threshold as the with-pickle path.
-            if sqlite_count > divergence_floor:
-                out["status"] = "diverged"
-                out["diverged"] = True
-                out["divergence"] = sqlite_count
-                out["message"] = (
-                    f"sqlite holds {sqlite_count:,} embeddings but the HNSW segment "
-                    "has never flushed metadata — vector search will return nothing "
-                    "until the segment is rebuilt. Run `mempalace repair`."
-                )
-            else:
-                out["message"] = "HNSW segment metadata not yet flushed; skipping"
+            # No pickle yet, so this probe cannot measure HNSW capacity.
+            # Chroma 1.5.x can have binary HNSW files without a flushed
+            # metadata pickle; absence of the pickle alone is not proof that
+            # vector search is unusable or dangerous. Keep the status unknown
+            # so MCP does not globally disable vectors on an inconclusive
+            # signal. Corrupt/invalid metadata, when present, is handled by
+            # quarantine_invalid_hnsw_metadata before Chroma opens.
+            out["message"] = (
+                "HNSW capacity unavailable: metadata has not been flushed; "
+                "leaving vector search enabled"
+            )
             return out
 
         divergence = sqlite_count - hnsw_count
diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index a14d90d..4ff0f23 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -396,6 +396,31 @@ def _bm25_only_via_sqlite(
             "hint": "Run: mempalace init <dir> && mempalace mine <dir>",
         }
 
+    def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
+        clauses = []
+        params = []
+        for key, value in (("wing", wing), ("room", room)):
+            if not value:
+                continue
+            clauses.append(
+                f"""
+                AND EXISTS (
+                    SELECT 1
+                    FROM embedding_metadata mf
+                    WHERE mf.id = {row_id_expr}
+                      AND mf.key = ?
+                      AND COALESCE(
+                        mf.string_value,
+                        CAST(mf.int_value AS TEXT),
+                        CAST(mf.float_value AS TEXT),
+                        CAST(mf.bool_value AS TEXT)
+                      ) = ?
+                )
+                """
+            )
+            params.extend([key, value])
+        return "".join(clauses), params
+
     try:
         conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
     except sqlite3.Error as e:
@@ -408,15 +433,17 @@ def _bm25_only_via_sqlite(
         candidate_ids: list[int] = []
         if tokens:
             fts_query = " OR ".join(tokens)
+            filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
             try:
                 rows = conn.execute(
-                    """
+                    f"""
                     SELECT rowid
                     FROM embedding_fulltext_search
                     WHERE embedding_fulltext_search MATCH ?
+                    {filter_sql}
                     LIMIT ?
                     """,
-                    (fts_query, max_candidates),
+                    (fts_query, *filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
@@ -434,17 +461,19 @@ def _bm25_only_via_sqlite(
             # fall back to ordering by primary-key id and finally to an
             # empty result rather than letting search raise.
             try:
+                filter_sql, filter_params = _metadata_filter_sql("e.id")
                 rows = conn.execute(
-                    """
+                    f"""
                     SELECT e.id
                     FROM embeddings e
                     JOIN segments s ON e.segment_id = s.id
                     JOIN collections c ON s.collection = c.id
                     WHERE c.name = 'mempalace_drawers'
+                    {filter_sql}
                     ORDER BY e.created_at DESC
                     LIMIT ?
                     """,
-                    (max_candidates,),
+                    (*filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
@@ -453,17 +482,19 @@ def _bm25_only_via_sqlite(
                     exc_info=True,
                 )
                 try:
+                    filter_sql, filter_params = _metadata_filter_sql("e.id")
                     rows = conn.execute(
-                        """
+                        f"""
                         SELECT e.id
                         FROM embeddings e
                         JOIN segments s ON e.segment_id = s.id
                         JOIN collections c ON s.collection = c.id
                         WHERE c.name = 'mempalace_drawers'
+                        {filter_sql}
                         ORDER BY e.id DESC
                         LIMIT ?
                         """,
-                        (max_candidates,),
+                        (*filter_params, max_candidates),
                     ).fetchall()
                     candidate_ids = [r[0] for r in rows]
                 except sqlite3.Error:
diff --git a/tests/test_hnsw_capacity.py b/tests/test_hnsw_capacity.py
index 512fc9c..912def8 100644
--- a/tests/test_hnsw_capacity.py
+++ b/tests/test_hnsw_capacity.py
@@ -238,14 +238,39 @@ def test_capacity_status_tolerates_flush_lag(tmp_path):
     assert info["status"] == "ok"
 
 
-def test_capacity_status_flags_unflushed_with_large_sqlite(tmp_path):
-    """No pickle + many sqlite rows is its own divergence signal."""
+def test_capacity_status_does_not_flag_unflushed_with_large_sqlite(tmp_path):
+    """No pickle + many sqlite rows is inconclusive, not divergence."""
     seg = "seg-noflush"
     _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
     info = hnsw_capacity_status(str(tmp_path), COLLECTION)
-    assert info["diverged"] is True
+    assert info["diverged"] is False
+    assert info["status"] == "unknown"
+    assert info["divergence"] is None
     assert info["hnsw_count"] is None
-    assert "never flushed" in info["message"]
+    assert "capacity unavailable" in info["message"]
+    assert "leaving vector search enabled" in info["message"]
+
+
+def test_mcp_probe_does_not_disable_vectors_for_unflushed_metadata(tmp_path, monkeypatch):
+    """The MCP preflight must not route all searches to BM25 on this signal."""
+    from mempalace import mcp_server
+
+    seg = "seg-mcp-noflush"
+    _seed_chroma_db(str(tmp_path), sqlite_count=10_000, segment_id=seg)
+
+    class _Cfg:
+        palace_path = str(tmp_path)
+
+    monkeypatch.setattr(mcp_server, "_config", _Cfg())
+    monkeypatch.setattr(mcp_server, "_vector_disabled", True)
+    monkeypatch.setattr(mcp_server, "_vector_disabled_reason", "old divergence")
+
+    mcp_server._refresh_vector_disabled_flag()
+
+    assert mcp_server._vector_disabled is False
+    assert mcp_server._vector_disabled_reason == ""
+    assert mcp_server._vector_capacity_status["status"] == "unknown"
+    assert "leaving vector search enabled" in mcp_server._vector_capacity_status["message"]
 
 
 def test_capacity_status_quiet_for_empty_palace(tmp_path):
@@ -372,6 +397,17 @@ def _seed_drawers(palace: str, segment_id: str, drawers: list[tuple[str, dict, s
         conn.close()
 
 
+def _set_drawer_created_at(palace: str, timestamps: dict[int, str]) -> None:
+    db_path = os.path.join(palace, "chroma.sqlite3")
+    conn = sqlite3.connect(db_path)
+    try:
+        for emb_id, created_at in timestamps.items():
+            conn.execute("UPDATE embeddings SET created_at = ? WHERE id = ?", (created_at, emb_id))
+        conn.commit()
+    finally:
+        conn.close()
+
+
 @pytest.fixture
 def palace_with_drawers(tmp_path):
     seg = "seg-bm25"
@@ -417,6 +453,122 @@ def test_bm25_fallback_filters_by_wing(palace_with_drawers):
     assert all(r["wing"] == "design" for r in out["results"])
 
 
+def test_bm25_fallback_applies_wing_before_fts_candidate_limit(tmp_path):
+    seg = "seg-bm25-fts-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-1",
+            ),
+            (
+                "shared token inside target wing",
+                {"wing": "project", "room": "diary", "source_file": "/x/project.md"},
+                "d-2",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+
+
+def test_bm25_fallback_applies_room_before_fts_candidate_limit(tmp_path):
+    seg = "seg-bm25-room-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token wrong room",
+                {"wing": "project", "room": "scratch", "source_file": "/x/scratch.md"},
+                "d-1",
+            ),
+            (
+                "shared token right room",
+                {"wing": "project", "room": "diary", "source_file": "/x/diary.md"},
+                "d-2",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite(
+        "shared token",
+        str(tmp_path),
+        wing="project",
+        room="diary",
+        max_candidates=1,
+    )
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+    assert out["results"][0]["room"] == "diary"
+
+
+def test_bm25_fallback_applies_wing_before_recency_candidate_limit(tmp_path):
+    seg = "seg-bm25-recency-limit"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "target drawer for short query",
+                {"wing": "project", "room": "diary", "source_file": "/x/project.md"},
+                "d-1",
+            ),
+            (
+                "newer drawer outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-2",
+            ),
+        ],
+    )
+    _set_drawer_created_at(
+        str(tmp_path),
+        {
+            1: "2026-01-01 00:00:00",
+            2: "2026-02-01 00:00:00",
+        },
+    )
+
+    out = _bm25_only_via_sqlite("a", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 1
+    assert len(out["results"]) == 1
+    assert out["results"][0]["wing"] == "project"
+
+
+def test_bm25_fallback_returns_empty_when_filtered_wing_has_no_candidates(tmp_path):
+    seg = "seg-bm25-empty-filter"
+    _seed_chroma_db(str(tmp_path), sqlite_count=0, segment_id=seg)
+    _seed_drawers(
+        str(tmp_path),
+        seg,
+        [
+            (
+                "shared token outside target wing",
+                {"wing": "ops", "room": "incidents", "source_file": "/x/ops.md"},
+                "d-1",
+            ),
+        ],
+    )
+
+    out = _bm25_only_via_sqlite("shared token", str(tmp_path), wing="project", max_candidates=1)
+
+    assert out["total_before_filter"] == 0
+    assert out["results"] == []
+
+
 def test_bm25_fallback_no_palace(tmp_path):
     out = _bm25_only_via_sqlite("anything", str(tmp_path))
     assert "error" in out

From ac6c2b6af6782668958732323969b78faa8b65be Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 1 May 2026 19:34:38 -0300
Subject: [PATCH 015/127] fix(mcp_server): pass embedding_function= on
 collection reopen (#1299)

`mcp_server._get_collection` bypassed `ChromaBackend.get_collection`
and called `client.get_collection` / `client.create_collection` without
`embedding_function=`. ChromaDB 1.x does not persist the EF identity
with the collection, so the MCP server's reopen silently bound
chromadb's built-in `DefaultEmbeddingFunction` while the miner / Stop
hook ingest path bound `mempalace.embedding.get_embedding_function()`.

On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple
Silicon, per #1299) the default EF's lazy ONNX provider selection could
SIGSEGV the host process on first `col.add()`, killing the MCP stdio
server and leaving every subsequent tool call returning
`Connection closed` until Claude Code was relaunched. Reads worked
because `col.get(ids=...)` and metadata fetches don't invoke the EF;
the auto-ingest path worked because mining routes through the backend
abstraction. Diary writes were the consistent failure surface.

Resolve the EF up front (matching `ChromaBackend._resolve_embedding_function`)
and pass it into both reopen branches. Falls back to the chromadb default
only if `mempalace.embedding.get_embedding_function` itself raises.

Regression test patches the chromadb client class to capture
`embedding_function=` on every `get_collection` / `create_collection`
call from `_get_collection(create=True)` and `_get_collection()`, and
fails if any call omits it.

Follow-up to #1262 / #1289 (which fixed the metadata-mismatch SIGSEGV
path); this addresses the EF-mismatch SIGSEGV path on the same surface.
---
 CHANGELOG.md             |  1 +
 mempalace/mcp_server.py  | 22 ++++++++++++++--
 tests/test_mcp_server.py | 56 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9f51968..337a7a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 ### Bug Fixes
 
+- **MCP server `tool_diary_write` SIGSEGV when EF default differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x does not persist the EF identity with the collection, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` to the collection while the miner / Stop hook ingest path bound `mempalace.embedding.get_embedding_function()`. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default's lazy ONNX provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now resolves the EF via `mempalace.embedding.get_embedding_function` and passes it into both reopen branches, matching the miner/backend path. (#1299, follow-up to #1262 / #1289)
 - **Cross-wing topic tunnels for hyphenated dir names.** `mempalace init` recorded the `topics_by_wing` registry key under the raw directory name (e.g. `mempalace-public`), while `mempalace.yaml`'s `wing` field used the lower-cased + separator-collapsed slug (`mempalace_public`). At mine time the miner read the slug from the yaml and missed the registry, so `_compute_topic_tunnels_for_wing` returned `0` silently. Real-world: any project whose folder contained a hyphen or space lost every topic tunnel. Now both call sites route through a shared `normalize_wing_name()` in `config.py`. (#1194, follow-up to #1180)
 - **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap.
 - **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179)
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index cce7a49..faa024c 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -66,6 +66,7 @@
     _pin_hnsw_threads,
     hnsw_capacity_status,
 )
+from .embedding import get_embedding_function  # noqa: E402
 from .query_sanitizer import sanitize_query  # noqa: E402
 from .searcher import search_memories  # noqa: E402
 from .palace_graph import (  # noqa: E402
@@ -278,6 +279,22 @@ def _get_collection(create=False):
     global _collection_cache, _metadata_cache, _metadata_cache_time
     try:
         client = _get_client()
+        # ChromaDB 1.x does not persist the embedding function with the
+        # collection, so a reader/writer that omits ``embedding_function=``
+        # silently gets the chromadb-built-in default. On bleeding-edge
+        # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
+        # the default's lazy ONNX provider selection can SIGSEGV the host
+        # process on first ``col.add()``. The miner / Stop hook ingest path
+        # avoids this because it routes through ``ChromaBackend.get_collection``
+        # which resolves the EF via ``mempalace.embedding.get_embedding_function``.
+        # The MCP server bypassed that abstraction; mirror its behaviour so
+        # ``tool_diary_write`` / ``tool_add_drawer`` get the same EF as mining.
+        try:
+            ef = get_embedding_function()
+        except Exception:
+            logger.exception("Failed to build embedding function; using chromadb default")
+            ef = None
+        ef_kwargs = {"embedding_function": ef} if ef is not None else {}
         if create:
             # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
             # HNSW insert path, which has a race in repairConnectionsForUpdate /
@@ -292,7 +309,7 @@ def _get_collection(create=False):
             # below skips the metadata-comparison codepath for existing
             # collections, mirroring the backend-layer fix from #1262.
             try:
-                raw = client.get_collection(_config.collection_name)
+                raw = client.get_collection(_config.collection_name, **ef_kwargs)
             except _ChromaNotFoundError:
                 raw = client.create_collection(
                     _config.collection_name,
@@ -301,13 +318,14 @@ def _get_collection(create=False):
                         "hnsw:num_threads": 1,
                         **_HNSW_BLOAT_GUARD,
                     },
+                    **ef_kwargs,
                 )
             _pin_hnsw_threads(raw)
             _collection_cache = ChromaCollection(raw)
             _metadata_cache = None
             _metadata_cache_time = 0
         elif _collection_cache is None:
-            raw = client.get_collection(_config.collection_name)
+            raw = client.get_collection(_config.collection_name, **ef_kwargs)
             _pin_hnsw_threads(raw)
             _collection_cache = ChromaCollection(raw)
             _metadata_cache = None
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 46e5f4a..f8148af 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -919,3 +919,59 @@ def _spy(self, *args, **kwargs):
         col2 = mcp_server._get_collection(create=True)
         assert col2 is not None
         assert calls == [], f"get_or_create_collection was called: {calls}"
+
+    def test_get_collection_passes_embedding_function(self, monkeypatch, config, palace_path, kg):
+        """Regression for #1299.
+
+        ``mcp_server._get_collection`` must pass ``embedding_function=`` into
+        both ``client.get_collection`` and ``client.create_collection``,
+        mirroring ``ChromaBackend.get_collection``. Without it, ChromaDB 1.x
+        falls back to its built-in ``DefaultEmbeddingFunction`` (whose lazy
+        ONNX provider selection has SIGSEGV'd on python 3.14 + Apple Silicon),
+        and writers/readers can disagree with the miner about which EF is
+        bound to the collection. The miner / Stop hook ingest path routes
+        through ``ChromaBackend.get_collection`` which does this correctly;
+        the MCP server must match.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        client = mcp_server._get_client()
+        client_cls = type(client)
+        captured: dict[str, list[dict]] = {"get": [], "create": []}
+        real_get = client_cls.get_collection
+        real_create = client_cls.create_collection
+
+        def _spy_get(self, name, **kwargs):
+            captured["get"].append(dict(kwargs))
+            return real_get(self, name, **kwargs)
+
+        def _spy_create(self, name, **kwargs):
+            captured["create"].append(dict(kwargs))
+            return real_create(self, name, **kwargs)
+
+        monkeypatch.setattr(client_cls, "get_collection", _spy_get)
+        monkeypatch.setattr(client_cls, "create_collection", _spy_create)
+        mcp_server._collection_cache = None
+
+        col = mcp_server._get_collection(create=True)
+        assert col is not None
+
+        all_calls = captured["get"] + captured["create"]
+        assert all_calls, "expected get_collection or create_collection to be called"
+        for kwargs in all_calls:
+            assert (
+                "embedding_function" in kwargs
+            ), f"missing embedding_function= in chromadb call: {kwargs}"
+            assert kwargs["embedding_function"] is not None
+
+        # Same expectation on the create=False (cache-miss) reopen path.
+        mcp_server._collection_cache = None
+        captured["get"].clear()
+        captured["create"].clear()
+        col2 = mcp_server._get_collection()
+        assert col2 is not None
+        assert captured["get"], "expected get_collection on cache-miss reopen"
+        for kwargs in captured["get"]:
+            assert "embedding_function" in kwargs
+            assert kwargs["embedding_function"] is not None

From cd98d6674e6cdb841146a17243f7e947bfeebfb8 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 1 May 2026 19:46:59 -0300
Subject: [PATCH 016/127] fix(mcp_server): address copilot review on #1303
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Resolve the EF inside the two reopen branches that actually call
  `client.get_collection` / `client.create_collection`, so warm-cache
  reads stay zero-cost (no `MempalaceConfig()` / `_resolve_providers`
  on every tool call).
- Reuse `ChromaBackend._resolve_embedding_function()` instead of
  duplicating its try/except + log message + None-fallback.
- Reword the inline + CHANGELOG explanation to clarify that ChromaDB 1.x
  persists the EF *identity* (its `name()`) but not the *instance/
  configuration* — `mempalace.embedding` documents this and spoofs
  `name()` to `"default"` precisely so the identity check passes; the
  bug was the *provider list* (lazy ONNX selection) silently differing.
---
 CHANGELOG.md            |  2 +-
 mempalace/mcp_server.py | 35 +++++++++++++++++++----------------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 337a7a1..41dfaac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 ### Bug Fixes
 
-- **MCP server `tool_diary_write` SIGSEGV when EF default differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x does not persist the EF identity with the collection, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` to the collection while the miner / Stop hook ingest path bound `mempalace.embedding.get_embedding_function()`. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default's lazy ONNX provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now resolves the EF via `mempalace.embedding.get_embedding_function` and passes it into both reopen branches, matching the miner/backend path. (#1299, follow-up to #1262 / #1289)
+- **MCP server `tool_diary_write` SIGSEGV when default EF provider differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x persists the EF *identity* (its `name()`) with the collection but not the EF *instance/configuration*, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` — its `name()` matches `mempalace.embedding`'s spoofed `"default"` so the identity check passes, but its provider list is chromadb's default rather than the user's resolved device. The miner / Stop hook ingest path routes through the backend helper and binds the configured EF instead. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now reuses `ChromaBackend._resolve_embedding_function()` on the reopen branches that actually open a collection (warm-cache reads stay zero-cost), matching the miner/backend path. (#1299, follow-up to #1262 / #1289)
 - **Cross-wing topic tunnels for hyphenated dir names.** `mempalace init` recorded the `topics_by_wing` registry key under the raw directory name (e.g. `mempalace-public`), while `mempalace.yaml`'s `wing` field used the lower-cased + separator-collapsed slug (`mempalace_public`). At mine time the miner read the slug from the yaml and missed the registry, so `_compute_topic_tunnels_for_wing` returned `0` silently. Real-world: any project whose folder contained a hyphen or space lost every topic tunnel. Now both call sites route through a shared `normalize_wing_name()` in `config.py`. (#1194, follow-up to #1180)
 - **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap.
 - **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179)
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index faa024c..13654f6 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -66,7 +66,6 @@
     _pin_hnsw_threads,
     hnsw_capacity_status,
 )
-from .embedding import get_embedding_function  # noqa: E402
 from .query_sanitizer import sanitize_query  # noqa: E402
 from .searcher import search_memories  # noqa: E402
 from .palace_graph import (  # noqa: E402
@@ -279,23 +278,25 @@ def _get_collection(create=False):
     global _collection_cache, _metadata_cache, _metadata_cache_time
     try:
         client = _get_client()
-        # ChromaDB 1.x does not persist the embedding function with the
-        # collection, so a reader/writer that omits ``embedding_function=``
-        # silently gets the chromadb-built-in default. On bleeding-edge
+        # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
+        # collection but not the EF *instance/configuration*. So a reader or
+        # writer that omits ``embedding_function=`` silently gets chromadb's
+        # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
+        # one we spoof in ``mempalace.embedding`` (both report ``"default"``,
+        # the identity check passes), but the *provider list* is chromadb's
+        # default rather than the user's resolved device. On bleeding-edge
         # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
-        # the default's lazy ONNX provider selection can SIGSEGV the host
-        # process on first ``col.add()``. The miner / Stop hook ingest path
-        # avoids this because it routes through ``ChromaBackend.get_collection``
-        # which resolves the EF via ``mempalace.embedding.get_embedding_function``.
-        # The MCP server bypassed that abstraction; mirror its behaviour so
-        # ``tool_diary_write`` / ``tool_add_drawer`` get the same EF as mining.
-        try:
-            ef = get_embedding_function()
-        except Exception:
-            logger.exception("Failed to build embedding function; using chromadb default")
-            ef = None
-        ef_kwargs = {"embedding_function": ef} if ef is not None else {}
+        # that default provider selection can SIGSEGV the host process on
+        # first ``col.add()``. The miner / Stop hook ingest path avoids this
+        # because it routes through ``ChromaBackend.get_collection``, which
+        # resolves the EF via ``ChromaBackend._resolve_embedding_function``;
+        # the MCP server bypassed that abstraction. Resolve the EF inside the
+        # branches that actually open a collection so warm-cache reads stay
+        # zero-cost. Reuse the backend helper so the two call sites can't
+        # drift on logging or fallback semantics.
         if create:
+            ef = ChromaBackend._resolve_embedding_function()
+            ef_kwargs = {"embedding_function": ef} if ef is not None else {}
             # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
             # HNSW insert path, which has a race in repairConnectionsForUpdate /
             # addPoint (see issues #974, #965). Set via metadata on fresh
@@ -325,6 +326,8 @@ def _get_collection(create=False):
             _metadata_cache = None
             _metadata_cache_time = 0
         elif _collection_cache is None:
+            ef = ChromaBackend._resolve_embedding_function()
+            ef_kwargs = {"embedding_function": ef} if ef is not None else {}
             raw = client.get_collection(_config.collection_name, **ef_kwargs)
             _pin_hnsw_threads(raw)
             _collection_cache = ChromaCollection(raw)

From 6509071b8e93919bb1f21dd625457fbf3a59ed5e Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 00:47:24 -0300
Subject: [PATCH 017/127] =?UTF-8?q?feat(searcher):=20add=20candidate=5Fstr?=
 =?UTF-8?q?ategy=3D"union"=20for=20vector=E2=88=AABM25=20reranking=20pool?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Default search behavior is unchanged. Opt-in candidate_strategy="union"
also pulls top-K BM25-only candidates from sqlite FTS5 and merges them
into the rerank pool, catching docs with strong BM25 signal that the
vector index didn't surface in the over-fetch window.

Motivation
----------
The current hybrid path gathers candidates from the vector index only
(n_results * 3 over-fetch), then BM25-reranks within them. When the
query embeds close to the wrong content semantically, the right doc
never enters the rerank pool — *no matter how wide the over-fetch*.
Tested on a ~6K-document mixed corpus (knowledge prose + short structured
records): at *30x* over-fetch (~5% of the corpus) the target doc still
didn't surface for narrative-shaped queries targeting terminology guides.
Wider over-fetch isn't the answer; widening the pool's *source* is.

Concrete failure mode: a narrative-shaped query embeds close to records
sharing the same operational vocabulary (other narrative entries in the
corpus). A terminology / style guide is BM25-strong for the query
(rare keywords the guide repeats) but vector-distant. Vector-only
candidates don't include it; BM25 never gets to rerank it. The hybrid
path produces 0.00 recall on a probe that pure BM25 alone scores 1.00 —
the hybrid is worse than its component on the same input.

Behavior change
---------------
* New parameter ``candidate_strategy: str = "vector"`` on ``search_memories``.
  - ``"vector"`` (default): historical behavior, no change.
  - ``"union"``: also fetch top ``n_results * 3`` candidates via the
    existing ``_bm25_only_via_sqlite`` helper, dedupe by source_file,
    merge into the rerank pool. BM25-only candidates carry
    ``distance=None`` so they're scored on BM25 contribution alone
    (vec_sim coerces to 0).
* ``_hybrid_rank`` now handles ``distance=None`` explicitly, scoring
  such candidates as vector-unknown (vec_sim=0) rather than treating
  it as max-distance via shim.
* New strategies register via ``_CANDIDATE_MERGERS``; dispatch is in
  ``_apply_candidate_strategy`` so ``search_memories`` stays under the
  C901 complexity ceiling.

Bench numbers (~6K-doc internal mixed corpus, recall@10, 5 probes spanning
policy-exception lookup, temporal-decay, style retrieval, set-difference,
and pattern-recognition):

                              baseline ("vector")   "union"
  policy-exception probe        0.00                  0.50    +0.50
  temporal-decay probe          0.17                  0.50    +0.33
  style-retrieval probe         0.00                  1.00    +1.00 (PASSES)
  set-difference probe          0.00–0.06             0.06–0.09  ~
  pattern-recog probe           0.64 (stable)         0.50–0.71  variance, typ. +0.07
  macro recall                  0.16–0.17             0.51–0.56  +0.34 to +0.40

The pattern-recog variance points at a related issue worth a separate PR:
``_hybrid_rank`` computes BM25 IDF over the candidate set. Adding new
candidates re-normalizes BM25 for *existing* candidates non-monotonically.
Stable corpus-wide BM25 would remove this. Out of scope here.

Tests
-----
``tests/test_hybrid_candidate_union.py`` (6 tests, all pass):
- default behavior unchanged (explicit ``"vector"`` matches default)
- ``"union"`` surfaces a BM25-strong vector-distant doc
- ``"union"`` doesn't drop docs ``"vector"`` would have found
- empty-palace handling
- invalid ``candidate_strategy`` raises
- ``_hybrid_rank`` tolerates ``distance=None``

Existing ``test_hybrid_search.py`` (5) and ``test_searcher.py`` (27) pass.

Performance note
----------------
Each ``"union"`` query adds one sqlite open + FTS5 MATCH + metadata
fetch (via the existing ``_bm25_only_via_sqlite`` helper, which already
runs as the ``vector_disabled`` fallback path so the code is
well-trodden). Per-query overhead is small but unmeasured at corpus
scale. Default stays ``"vector"`` until a maintainer characterizes the
cost.
---
 mempalace/searcher.py                | 104 ++++++++++++++++++++-
 tests/test_hybrid_candidate_union.py | 133 +++++++++++++++++++++++++++
 2 files changed, 236 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_hybrid_candidate_union.py

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index a14d90d..7a46158 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -134,6 +134,11 @@ def _hybrid_rank(
       themselves. Since the absolute scale is unbounded, BM25 is min-max
       normalized within the candidate set so weights are commensurable.
 
+    Candidates with ``distance=None`` are treated as vector-unknown
+    (no vector signal available) and scored on BM25 contribution alone.
+    Used by candidate-union mode to merge BM25-only candidates that the
+    vector index didn't surface.
+
     Mutates each result dict to add ``bm25_score`` and reorders the list
     in place. Returns the same list for convenience.
     """
@@ -147,7 +152,11 @@ def _hybrid_rank(
 
     scored = []
     for r, raw, norm in zip(results, bm25_raw, bm25_norm):
-        vec_sim = max(0.0, 1.0 - r.get("distance", 1.0))
+        distance = r.get("distance")
+        if distance is None:
+            vec_sim = 0.0
+        else:
+            vec_sim = max(0.0, 1.0 - distance)
         r["bm25_score"] = round(raw, 3)
         scored.append((vector_weight * vec_sim + bm25_weight * norm, r))
 
@@ -545,6 +554,79 @@ def _bm25_only_via_sqlite(
     }
 
 
+def _merge_bm25_union_candidates(
+    hits: list,
+    query: str,
+    palace_path: str,
+    wing: str,
+    room: str,
+    n_results: int,
+) -> None:
+    """Append top-K BM25-only candidates from sqlite into ``hits`` in place.
+
+    Used by ``search_memories(..., candidate_strategy="union")`` to widen
+    the rerank pool's *source* (not just its size) — vector-only candidate
+    selection skips docs whose embeddings are far from the query even when
+    BM25 signal is strong. We dedupe against existing hits by ``source_file``
+    so vector-side entries (which carry real distance values) win on
+    collisions; BM25-only additions are marked with ``distance=None`` so
+    ``_hybrid_rank`` scores them on BM25 contribution alone.
+    """
+    try:
+        bm25_extra = _bm25_only_via_sqlite(
+            query,
+            palace_path,
+            wing=wing,
+            room=room,
+            n_results=n_results * 3,
+        ).get("results", [])
+    except Exception:
+        logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
+        return
+
+    seen_sources = {h.get("source_file") for h in hits}
+    for bh in bm25_extra:
+        key = bh.get("source_file")
+        if not key or key == "?" or key in seen_sources:
+            continue
+        bh["distance"] = None
+        bh["effective_distance"] = None
+        bh["closet_boost"] = 0.0
+        hits.append(bh)
+        seen_sources.add(key)
+
+
+# Strategy dispatch — keeps search_memories' branch count under the
+# project's complexity ceiling (C901 max-complexity=25). New strategies
+# register here.
+_CANDIDATE_MERGERS = {
+    "vector": None,  # default no-op
+    "union": _merge_bm25_union_candidates,
+}
+
+
+def _apply_candidate_strategy(
+    strategy: str,
+    hits: list,
+    query: str,
+    palace_path: str,
+    wing: str,
+    room: str,
+    n_results: int,
+) -> None:
+    """Dispatch to the registered merger for ``strategy``.
+
+    Raises ``ValueError`` for unknown strategies. ``"vector"`` is a no-op.
+    """
+    if strategy not in _CANDIDATE_MERGERS:
+        raise ValueError(
+            f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, " f"got {strategy!r}"
+        )
+    merger = _CANDIDATE_MERGERS[strategy]
+    if merger is not None:
+        merger(hits, query, palace_path, wing, room, n_results)
+
+
 def search_memories(
     query: str,
     palace_path: str,
@@ -553,6 +635,7 @@ def search_memories(
     n_results: int = 5,
     max_distance: float = 0.0,
     vector_disabled: bool = False,
+    candidate_strategy: str = "vector",
 ) -> dict:
     """Programmatic search — returns a dict instead of printing.
 
@@ -572,6 +655,20 @@ def search_memories(
             (#1222). Set by the MCP server when the HNSW capacity probe
             detects a divergence that would segfault chromadb on segment
             load.
+        candidate_strategy: How candidates for the hybrid re-rank are gathered.
+
+            * ``"vector"`` (default) — preserves historical behavior: top
+              ``n_results * 3`` rows from the vector index are the rerank pool.
+              Cheap; works well when query and target docs agree in the
+              embedding space.
+            * ``"union"`` — also pull top ``n_results * 3`` BM25 candidates
+              from the sqlite FTS5 index and merge them into the rerank pool
+              (deduped by source_file). Catches docs with strong BM25 signal
+              that are vector-distant from the query (e.g. terminology guides
+              looked up by narrative-shaped queries; policy clauses surfaced
+              by scenario descriptions). Adds one sqlite open + FTS5 MATCH
+              per query; perf cost is small but unmeasured at corpus scale.
+              Opt in until the cost is characterized.
     """
     if vector_disabled:
         return _bm25_only_via_sqlite(
@@ -748,6 +845,11 @@ def search_memories(
         h["drawer_index"] = best_idx
         h["total_drawers"] = len(ordered_docs)
 
+    # Candidate strategy hook: optionally widen the rerank pool's *source*
+    # before ranking. Default ("vector") is a no-op; "union" merges top-K
+    # BM25 candidates from sqlite. See `_apply_candidate_strategy`.
+    _apply_candidate_strategy(candidate_strategy, hits, query, palace_path, wing, room, n_results)
+
     # BM25 hybrid re-rank within the final candidate set.
     hits = _hybrid_rank(hits, query)
     for h in hits:
diff --git a/tests/test_hybrid_candidate_union.py b/tests/test_hybrid_candidate_union.py
new file mode 100644
index 0000000..97cf4d1
--- /dev/null
+++ b/tests/test_hybrid_candidate_union.py
@@ -0,0 +1,133 @@
+"""Tests for ``candidate_strategy="union"`` in ``search_memories``.
+
+The default ``"vector"`` strategy gathers candidates from the vector index
+only. Docs with strong BM25 signal but vector embeddings far from the query
+get skipped — terminology guides looked up by narrative-shaped queries are
+the canonical case.
+
+The ``"union"`` strategy also pulls top-K BM25-only candidates from sqlite
+FTS5 and merges them into the rerank pool. Both signal sources contribute
+candidates; the hybrid rerank picks the best from a richer pool.
+
+Default behavior is unchanged ("vector") — these tests exercise opt-in
+"union" mode.
+"""
+
+from mempalace.palace import get_collection
+from mempalace.searcher import search_memories
+
+
+def _seed_drawers(palace_path):
+    """Seed a corpus where the right doc for one query is BM25-strong but
+    vector-distant.
+
+    D1-D3 are short narrative tickets that semantically cluster around
+    "customer support / order / shipped" vocabulary. D4 is a meta-document
+    of bullet rules ("brand voice") that contains rare keywords like
+    "Absolutely" and "apologize" the query repeats verbatim — strong BM25
+    signal but stylistically far from the narrative tickets.
+    """
+    col = get_collection(palace_path, create=True)
+    col.upsert(
+        ids=["D1", "D2", "D3", "D4"],
+        documents=[
+            "Customer wrote in asking why their order shipped without "
+            "the promo sticker. Standard reply explaining the threshold.",
+            "Order delivery delayed three days; customer requested a "
+            "refund. Support agent processed return via ticket queue.",
+            "Customer asked about the missing freebie; the reply "
+            "explained the campaign mechanics and shipped status.",
+            "Brand voice rules: dry, sturdy, never effusive. "
+            "Never 'Absolutely!' Never apologize for policy — explain it. "
+            "Avoid premium / curated / elevated vocabulary.",
+        ],
+        metadatas=[
+            {"wing": "shop", "room": "support", "source_file": "ticket_D1.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D2.md"},
+            {"wing": "shop", "room": "support", "source_file": "ticket_D3.md"},
+            {"wing": "shop", "room": "guides", "source_file": "brand_voice_D4.md"},
+        ],
+    )
+
+
+_NARRATIVE_QUERY = (
+    "A support agent is drafting a reply to a customer asking why their "
+    "order shipped without a free sticker. Draft the reply, but never say "
+    "'Absolutely!' and do not apologize for policy."
+)
+
+
+class TestCandidateUnion:
+    def test_default_vector_strategy_unchanged(self, tmp_path):
+        """Default behavior must be identical to omitting the parameter."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        without = search_memories(_NARRATIVE_QUERY, palace, n_results=5)
+        with_default = search_memories(
+            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector"
+        )
+        ids_a = [h["source_file"] for h in without["results"]]
+        ids_b = [h["source_file"] for h in with_default["results"]]
+        assert ids_a == ids_b, "explicit candidate_strategy='vector' must match default"
+
+    def test_union_surfaces_bm25_strong_vector_distant_doc(self, tmp_path):
+        """The brand-voice doc has strong BM25 signal for the query but is
+        stylistically far from the narrative tickets. Union mode must
+        retrieve it; vector-only mode is allowed to miss it."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        result = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        ids = [h["source_file"] for h in result["results"]]
+        assert "brand_voice_D4.md" in ids, (
+            "union mode must surface BM25-strong docs even when vector signal "
+            f"is weak; got {ids}"
+        )
+
+    def test_union_preserves_vector_hits(self, tmp_path):
+        """Union mode must not drop docs that vector-only mode finds —
+        the rerank pool grows, it doesn't shrink."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        vector = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="vector")
+        union = search_memories(_NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union")
+        vec_ids = {h["source_file"] for h in vector["results"]}
+        union_ids = {h["source_file"] for h in union["results"]}
+        # In a 4-doc corpus with n_results=5, both should return all 4.
+        # The invariant is: union should not lose anything vector found.
+        missing = vec_ids - union_ids
+        assert not missing, f"union dropped docs that vector found: {missing}"
+
+    def test_union_handles_empty_palace(self, tmp_path):
+        """No drawers — union mode should return empty results, not crash."""
+        palace = str(tmp_path / "palace")
+        get_collection(palace, create=True)  # create empty collection
+        result = search_memories("anything", palace, n_results=5, candidate_strategy="union")
+        assert result.get("results", []) == []
+
+    def test_invalid_candidate_strategy_raises(self, tmp_path):
+        """Bad arg should raise rather than silently fall back."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        import pytest
+
+        with pytest.raises(ValueError, match="candidate_strategy"):
+            search_memories("anything", palace, n_results=5, candidate_strategy="bogus")
+
+
+class TestHybridRankTolerantOfMissingDistance:
+    """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only
+    candidates injected by union mode."""
+
+    def test_distance_none_scored_as_zero_vector_sim(self):
+        from mempalace.searcher import _hybrid_rank
+
+        results = [
+            {"text": "alpha beta gamma", "distance": 0.2},  # close vector match
+            {"text": "alpha alpha alpha", "distance": None},  # BM25-only — heavy term repetition
+        ]
+        # Query matches "alpha" heavily; the BM25-only candidate with no
+        # vector signal should still rank competitively on BM25 alone.
+        ranked = _hybrid_rank(results, "alpha")
+        assert all("bm25_score" in r for r in ranked), "rerank should add bm25_score"
+        # Both must survive — neither should crash on distance=None.
+        assert len(ranked) == 2

From d07b730f08fc427a483620d2e9b61d4e3da692d4 Mon Sep 17 00:00:00 2001
From: Mikhail Valentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 05:25:11 +0500
Subject: [PATCH 018/127] fix(hooks): quote CLAUDE_PLUGIN_ROOT /
 CODEX_PLUGIN_ROOT in hooks.json (#1076) (#1077)

Shell splits hook command on whitespace after variable expansion, breaking
paths with spaces (e.g. C:\Users\Richard M on Windows). Wrapping the path
in double quotes preserves the token boundary.

Fixes the reported Stop/PreCompact pair in .claude-plugin/hooks/hooks.json
and applies the same fix to .codex-plugin/hooks.json (SessionStart/Stop/
PreCompact), which carries the identical bug.
---
 .claude-plugin/hooks/hooks.json | 4 ++--
 .codex-plugin/hooks.json        | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.claude-plugin/hooks/hooks.json b/.claude-plugin/hooks/hooks.json
index f1f0a90..b80a785 100644
--- a/.claude-plugin/hooks/hooks.json
+++ b/.claude-plugin/hooks/hooks.json
@@ -6,7 +6,7 @@
         "hooks": [
           {
             "type": "command",
-            "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/mempal-stop-hook.sh"
+            "command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/mempal-stop-hook.sh\""
           }
         ]
       }
@@ -16,7 +16,7 @@
         "hooks": [
           {
             "type": "command",
-            "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/mempal-precompact-hook.sh"
+            "command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/mempal-precompact-hook.sh\""
           }
         ]
       }
diff --git a/.codex-plugin/hooks.json b/.codex-plugin/hooks.json
index 46f7e66..02705f7 100644
--- a/.codex-plugin/hooks.json
+++ b/.codex-plugin/hooks.json
@@ -6,7 +6,7 @@
         "hooks": [
           {
             "type": "command",
-            "command": "${CODEX_PLUGIN_ROOT}/hooks/mempal-hook.sh session-start"
+            "command": "\"${CODEX_PLUGIN_ROOT}/hooks/mempal-hook.sh\" session-start"
           }
         ]
       }
@@ -17,7 +17,7 @@
         "hooks": [
           {
             "type": "command",
-            "command": "${CODEX_PLUGIN_ROOT}/hooks/mempal-hook.sh stop"
+            "command": "\"${CODEX_PLUGIN_ROOT}/hooks/mempal-hook.sh\" stop"
           }
         ]
       }
@@ -28,7 +28,7 @@
         "hooks": [
           {
             "type": "command",
-            "command": "${CODEX_PLUGIN_ROOT}/hooks/mempal-hook.sh precompact"
+            "command": "\"${CODEX_PLUGIN_ROOT}/hooks/mempal-hook.sh\" precompact"
           }
         ]
       }

From 8472d553a34b83cb3ba6d50142640a8d2744d80c Mon Sep 17 00:00:00 2001
From: lcatlett <lindsey.catlett@gmail.com>
Date: Fri, 1 May 2026 19:26:43 -0400
Subject: [PATCH 019/127] fix(hooks): treat absent ~/.mempalace as auto-save
 off

When the user removes ~/.mempalace/ (a strong "do not auto-capture"
signal), the next hook fire would silently recreate the entire dir
hierarchy and ingest existing transcripts:

1. _log() at hooks_cli.py:148 unconditionally calls
   STATE_DIR.mkdir(parents=True, exist_ok=True), so the act of
   writing the hook log line recreated ~/.mempalace/hook_state/
2. With no config file present, hook_stop_auto_save and
   hook_precompact_auto_save defaulted to True (no override to read)
3. The full save path then ran, materializing palace/, wal/,
   knowledge_graph.sqlite3, and N drawers from existing transcripts
   in ~/.claude/projects/*.jsonl

All four entry points (hook_stop, hook_precompact, hook_session_start,
and _log itself) now check a new PALACE_ROOT = Path.home() / ".mempalace"
constant first and short-circuit (returning {} on stdout, never logging)
when the dir is absent. The user-removable directory is now a kill-switch.

Five unit tests in tests/test_hooks_cli.py cover: hook_stop /
hook_precompact / hook_session_start do not create the dir when absent;
_log() does not create it when absent; existing dir proceeds normally
(regression).

Caught in the wild on a downstream fork: ~146 drawers materialized in
under a second after a deliberate `rm -rf ~/.mempalace/`, into a planning
session that was explicitly not meant to be captured.
---
 CHANGELOG.md            |  1 +
 mempalace/hooks_cli.py  | 23 +++++++++++++
 tests/test_hooks_cli.py | 76 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 100 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41dfaac..7a75842 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 ### Bug Fixes
 
 - **MCP server `tool_diary_write` SIGSEGV when default EF provider differs.** `mcp_server._get_collection` bypassed `ChromaBackend.get_collection` and called `client.get_collection` / `client.create_collection` without `embedding_function=`. ChromaDB 1.x persists the EF *identity* (its `name()`) with the collection but not the EF *instance/configuration*, so the MCP server's reopen silently bound chromadb's built-in `DefaultEmbeddingFunction` — its `name()` matches `mempalace.embedding`'s spoofed `"default"` so the identity check passes, but its provider list is chromadb's default rather than the user's resolved device. The miner / Stop hook ingest path routes through the backend helper and binds the configured EF instead. On bleeding-edge interpreters (python 3.14 + chromadb 1.5.x on Apple Silicon) the default provider selection could SIGSEGV the host process on first `col.add()`, killing the MCP stdio server and leaving every subsequent tool call returning `Connection closed` until Claude Code was relaunched. `_get_collection` now reuses `ChromaBackend._resolve_embedding_function()` on the reopen branches that actually open a collection (warm-cache reads stay zero-cost), matching the miner/backend path. (#1299, follow-up to #1262 / #1289)
+- **Hooks no longer recreate `~/.mempalace/` after the user removes it.** When `~/.mempalace/` is deleted (a strong "do not auto-capture" signal), the next `Stop`, `PreCompact`, or `SessionStart` hook would silently rebuild the dir hierarchy and ingest existing transcripts: `_log()` called `STATE_DIR.mkdir(parents=True, exist_ok=True)` unconditionally, so the very act of writing `[HH:MM] SESSION START …` recreated `~/.mempalace/hook_state/`; subsequent calls in the save path then materialized `palace/`, `wal/`, `knowledge_graph.sqlite3`, and N drawers from `~/.claude/projects/*.jsonl`. All four entry points (`hook_stop`, `hook_precompact`, `hook_session_start`, and `_log` itself) now check a new module-level `PALACE_ROOT = Path.home() / ".mempalace"` constant first and short-circuit (returning `{}` on stdout, never logging) when the directory is absent. The user-removable directory becomes a kill-switch — `rm -rf ~/.mempalace` is now a stable state. Net: 23 lines added in `mempalace/hooks_cli.py`, 5 unit tests in `tests/test_hooks_cli.py`. (#1305)
 - **Cross-wing topic tunnels for hyphenated dir names.** `mempalace init` recorded the `topics_by_wing` registry key under the raw directory name (e.g. `mempalace-public`), while `mempalace.yaml`'s `wing` field used the lower-cased + separator-collapsed slug (`mempalace_public`). At mine time the miner read the slug from the yaml and missed the registry, so `_compute_topic_tunnels_for_wing` returned `0` silently. Real-world: any project whose folder contained a hyphen or space lost every topic tunnel. Now both call sites route through a shared `normalize_wing_name()` in `config.py`. (#1194, follow-up to #1180)
 - **CLI `mempalace search` retrieval quality.** The CLI was using pure ChromaDB cosine distance with no BM25 rerank, so drawers containing every query term but embedding as noise (directory listings, diff output, shell logs) scored `Match: 0.0` alongside genuinely irrelevant results with no way to tell them apart. Wired the CLI through the same `_hybrid_rank` the `mempalace_search` MCP tool already used, and surfaced both `cosine=` and `bm25=` scores in the output so users see which component of the match is firing. MCP search was unaffected; this fixes the human-facing CLI parity gap.
 - **Legacy-palace distance-metric warning.** CLI search now detects palaces created before `hnsw:space=cosine` was consistently set and prints a one-line notice pointing at `mempalace repair`. Without the warning such palaces silently used L2 distance, under which the similarity display floored every result to `Match: 0.0`. New palaces mined today already set cosine correctly and now have invariant tests pinning that behavior so future refactors can't silently regress it. (#1179)
diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py
index d4f8317..ca8fb60 100644
--- a/mempalace/hooks_cli.py
+++ b/mempalace/hooks_cli.py
@@ -16,6 +16,18 @@
 
 SAVE_INTERVAL = 15
 STATE_DIR = Path.home() / ".mempalace" / "hook_state"
+PALACE_ROOT = Path.home() / ".mempalace"
+
+
+def _palace_root_exists() -> bool:
+    """User-removable kill-switch.
+
+    If ~/.mempalace/ does not exist, the user has explicitly cleared it.
+    All hook side effects (logging, state dir creation, mining, ingestion)
+    must respect this and short-circuit BEFORE touching disk — including
+    before logging the short-circuit itself.
+    """
+    return PALACE_ROOT.exists()
 
 
 def _mempalace_python() -> str:
@@ -142,6 +154,8 @@ def _count_human_messages(transcript_path: str) -> int:
 
 def _log(message: str):
     """Append to hook state log file."""
+    if not PALACE_ROOT.exists():
+        return  # User removed the palace; do not recreate by logging
     global _state_dir_initialized
     try:
         if not _state_dir_initialized:
@@ -550,6 +564,9 @@ def _wing_from_transcript_path(transcript_path: str) -> str:
 
 def hook_stop(data: dict, harness: str):
     """Stop hook: block every N messages for auto-save."""
+    if not _palace_root_exists():
+        _output({})
+        return
     parsed = _parse_harness_input(data, harness)
     session_id = parsed["session_id"]
     stop_hook_active = parsed["stop_hook_active"]
@@ -659,6 +676,9 @@ def hook_stop(data: dict, harness: str):
 
 def hook_session_start(data: dict, harness: str):
     """Session start hook: initialize session tracking state."""
+    if not _palace_root_exists():
+        _output({})
+        return
     parsed = _parse_harness_input(data, harness)
     session_id = parsed["session_id"]
 
@@ -673,6 +693,9 @@ def hook_session_start(data: dict, harness: str):
 
 def hook_precompact(data: dict, harness: str):
     """Precompact hook: mine transcript synchronously, then allow compaction."""
+    if not _palace_root_exists():
+        _output({})
+        return
     parsed = _parse_harness_input(data, harness)
     session_id = parsed["session_id"]
     transcript_path = parsed["transcript_path"]
diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index 1ceb530..941288d 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -959,3 +959,79 @@ def test_stop_hook_rejects_injected_stop_hook_active(tmp_path):
     # The injected value is not "true"/"1"/"yes", so the hook should NOT pass through.
     # Save must have been attempted.
     assert mock_save.called
+
+
+# --- Absent palace root: hooks must not recreate ~/.mempalace ---
+#
+# When the user removes ~/.mempalace (e.g. `rm -rf`), that is the strongest
+# possible "do not auto-capture" signal. Hooks must short-circuit BEFORE
+# touching disk — including before the log-line that previously triggered
+# STATE_DIR.mkdir() on its own.
+
+
+import mempalace.hooks_cli as hooks_cli_mod
+
+
+def _redirect_palace_root(monkeypatch, tmp_path):
+    """Point PALACE_ROOT and STATE_DIR at a tmp location that does NOT exist."""
+    fake_root = tmp_path / "absent-mempalace"
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    return fake_root
+
+
+def test_hook_stop_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_stop(
+            {"session_id": "absent", "transcript_path": str(transcript), "stop_hook_active": False},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_precompact_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    transcript = tmp_path / "t.jsonl"
+    transcript.write_text("")
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_precompact(
+            {"session_id": "absent", "transcript_path": str(transcript)},
+            "claude-code",
+        )
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_hook_session_start_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "absent"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+    assert not fake_root.exists()
+
+
+def test_log_does_not_create_palace_dir_when_absent(tmp_path, monkeypatch):
+    fake_root = _redirect_palace_root(monkeypatch, tmp_path)
+    _log("test message")
+    assert not fake_root.exists()
+
+
+def test_existing_dir_proceeds_normally(tmp_path, monkeypatch):
+    """Regression: when PALACE_ROOT exists, hooks must proceed (no short-circuit)."""
+    fake_root = tmp_path / "present-mempalace"
+    fake_root.mkdir()
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+    _log("test message")
+    # _log should have created the state dir under the existing palace root
+    assert (fake_root / "hook_state").exists()
+    assert (fake_root / "hook_state" / "hook.log").is_file()

From 2d50b214d4c2c05112a3258faa8bef39cf6c07c0 Mon Sep 17 00:00:00 2001
From: lcatlett <lindsey.catlett@gmail.com>
Date: Sat, 2 May 2026 20:37:47 -0400
Subject: [PATCH 020/127] fix(hooks): use is_dir() for palace root check
 (review feedback)

Both @igorls and the Qodo bot flagged that `_palace_root_exists()` used
`Path.exists()`, which returns True for a regular file. A stray file at
`~/.mempalace` would let the kill-switch be bypassed and crash later in
`STATE_DIR.mkdir()` with NotADirectoryError.

Switched to `Path.is_dir()`. Also fold `_log()`'s inline check through
`_palace_root_exists()` so both kill-switch sites use the same predicate.

New test pins the behavior: a regular file at the palace root path is
treated as absent (hook short-circuits, _log does not crash, the stray
file is left untouched).
---
 mempalace/hooks_cli.py  |  9 +++++++--
 tests/test_hooks_cli.py | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py
index ca8fb60..8498103 100644
--- a/mempalace/hooks_cli.py
+++ b/mempalace/hooks_cli.py
@@ -26,8 +26,13 @@ def _palace_root_exists() -> bool:
     All hook side effects (logging, state dir creation, mining, ingestion)
     must respect this and short-circuit BEFORE touching disk — including
     before logging the short-circuit itself.
+
+    Uses ``is_dir()`` rather than ``exists()`` so a stray regular file at
+    ``~/.mempalace`` (or a broken symlink) is treated as absent — otherwise
+    the kill-switch would be bypassed and ``STATE_DIR.mkdir()`` would later
+    crash on ``NotADirectoryError``.
     """
-    return PALACE_ROOT.exists()
+    return PALACE_ROOT.is_dir()
 
 
 def _mempalace_python() -> str:
@@ -154,7 +159,7 @@ def _count_human_messages(transcript_path: str) -> int:
 
 def _log(message: str):
     """Append to hook state log file."""
-    if not PALACE_ROOT.exists():
+    if not _palace_root_exists():
         return  # User removed the palace; do not recreate by logging
     global _state_dir_initialized
     try:
diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index 941288d..487acf7 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -1035,3 +1035,35 @@ def test_existing_dir_proceeds_normally(tmp_path, monkeypatch):
     # _log should have created the state dir under the existing palace root
     assert (fake_root / "hook_state").exists()
     assert (fake_root / "hook_state" / "hook.log").is_file()
+
+
+def test_regular_file_at_palace_root_treated_as_absent(tmp_path, monkeypatch):
+    """A regular file at ~/.mempalace must be treated the same as absent.
+
+    ``Path.exists()`` returns True for a regular file, which would let the
+    kill-switch be bypassed and crash later when ``STATE_DIR.mkdir()`` runs
+    on ``NotADirectoryError``. ``_palace_root_exists()`` must use
+    ``is_dir()`` so a stray file (or broken symlink) short-circuits cleanly.
+    """
+    fake_root = tmp_path / "file-not-dir"
+    fake_root.write_text("oops, this is a file not a directory")
+    monkeypatch.setattr(hooks_cli_mod, "PALACE_ROOT", fake_root)
+    monkeypatch.setattr(hooks_cli_mod, "STATE_DIR", fake_root / "hook_state")
+    monkeypatch.setattr(hooks_cli_mod, "_state_dir_initialized", False)
+
+    # _palace_root_exists() is the source of truth — it must return False.
+    assert hooks_cli_mod._palace_root_exists() is False
+
+    # Hooks must short-circuit (return {} on stdout) and not touch disk.
+    buf = io.StringIO()
+    with contextlib.redirect_stdout(buf):
+        hook_session_start({"session_id": "file-at-root"}, "claude-code")
+    assert json.loads(buf.getvalue() or "{}") == {}
+
+    # _log must also short-circuit — it must NOT try to mkdir a path under a
+    # regular file (which would raise NotADirectoryError).
+    _log("test message")  # would raise if not short-circuited
+
+    # The stray file is left untouched; we never try to convert it.
+    assert fake_root.is_file()
+    assert fake_root.read_text() == "oops, this is a file not a directory"

From cbd6e5d65d15edb6026a238aea06e2736af7942a Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:54:14 -0300
Subject: [PATCH 021/127] fix(cli): write compress output to mempalace_closets
 so palace can read them (#1244)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`cmd_compress` was writing AAAK-compressed drawers to a `mempalace_compressed`
collection, but every read path (`palace.get_closets_collection`,
`searcher.py`, `repair.py`) reads from `mempalace_closets`. Result: for
non-mined palaces (or any palace where the user ran `mempalace compress`
expecting to backfill the closet/index layer), the compressed output was
silently invisible — written to a collection nothing else opens.

Fix the writer rather than renaming the readers: "closets" is the
user-visible feature name baked into the public API
(`get_closets_collection`), the searcher hybrid path, repair/HNSW
diagnostics, and docs. Renaming the readers would churn 15+ call sites
and the README for no benefit. The compressed AAAK strings are exactly
what closets are conceptually — compact pointers scanned by an LLM to
locate the right drawer — so they belong in `mempalace_closets`.

Tests:
- Update `test_cmd_compress_stores_results` to assert the collection
  name passed to `get_or_create_collection` is `mempalace_closets`.
- Add `test_cmd_compress_output_readable_via_get_closets_collection`:
  end-to-end with a real ChromaBackend, seed a drawer, run cmd_compress,
  then read back via the same `get_closets_collection` helper that
  palace.py / searcher use. Regression test for the wrong-collection
  bug.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/cli.py  |  4 ++--
 tests/test_cli.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index ca9798b..d47f38e 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -902,7 +902,7 @@ def cmd_compress(args):
     # Store compressed versions (unless dry-run)
     if not args.dry_run:
         try:
-            comp_col = backend.get_or_create_collection(palace_path, "mempalace_compressed")
+            comp_col = backend.get_or_create_collection(palace_path, "mempalace_closets")
             for doc_id, compressed, meta, stats in compressed_entries:
                 comp_meta = dict(meta)
                 comp_meta["compression_ratio"] = round(stats["size_ratio"], 1)
@@ -913,7 +913,7 @@ def cmd_compress(args):
                     metadatas=[comp_meta],
                 )
             print(
-                f"  Stored {len(compressed_entries)} compressed drawers in 'mempalace_compressed' collection."
+                f"  Stored {len(compressed_entries)} compressed drawers in 'mempalace_closets' collection."
             )
         except Exception as e:
             print(f"  Error storing compressed drawers: {e}")
diff --git a/tests/test_cli.py b/tests/test_cli.py
index af7b39d..74521e6 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -889,7 +889,7 @@ def test_cmd_compress_with_config(mock_config_cls, tmp_path, capsys):
 
 @patch("mempalace.cli.MempalaceConfig")
 def test_cmd_compress_stores_results(mock_config_cls, capsys):
-    """Non-dry-run compress stores to mempalace_compressed collection."""
+    """Non-dry-run compress stores to mempalace_closets collection (#1244)."""
     mock_config_cls.return_value.palace_path = "/fake/palace"
     args = argparse.Namespace(palace=None, wing=None, dry_run=False, config=None)
     mock_col = MagicMock()
@@ -927,6 +927,53 @@ def test_cmd_compress_stores_results(mock_config_cls, capsys):
     assert "Stored" in out
     assert "Total:" in out
     mock_comp_col.upsert.assert_called_once()
+    # Verify the compress output goes to the closets collection so that
+    # palace.get_closets_collection() / searcher can read it back (#1244).
+    (call_args, _kwargs) = mock_backend.get_or_create_collection.call_args
+    assert call_args[1] == "mempalace_closets", (
+        f"compress should write to mempalace_closets, got {call_args[1]!r}"
+    )
+    assert "mempalace_closets" in out
+
+
+def test_cmd_compress_output_readable_via_get_closets_collection(tmp_path, capsys):
+    """End-to-end: cmd_compress output must be readable via the same code
+    path palace.py uses (`get_closets_collection`). Regression for #1244."""
+    from mempalace.backends.chroma import ChromaBackend
+    from mempalace.palace import get_closets_collection, get_collection
+
+    palace_path = str(tmp_path / "palace")
+
+    # Seed a drawer in the palace so cmd_compress has something to compress.
+    drawers = get_collection(palace_path, "mempalace_drawers", create=True)
+    drawers.upsert(
+        ids=["drawer-1"],
+        documents=["The quick brown fox jumps over the lazy dog."],
+        metadatas=[{"wing": "test", "room": "demo", "source_file": "fox.txt"}],
+    )
+
+    args = argparse.Namespace(palace=palace_path, wing=None, dry_run=False, config=None)
+    with patch("mempalace.cli.MempalaceConfig") as mock_config_cls:
+        mock_config_cls.return_value.palace_path = palace_path
+        # Use a real ChromaBackend so the write actually lands on disk and
+        # the read-side helper can find it.
+        with patch("mempalace.backends.chroma.ChromaBackend", side_effect=ChromaBackend):
+            cmd_compress(args)
+
+    out = capsys.readouterr().out
+    assert "Stored" in out
+
+    # Now read via the *same* code path palace.py / searcher uses.
+    closets = get_closets_collection(palace_path, create=False)
+    got = closets.get(ids=["drawer-1"], include=["documents", "metadatas"])
+    assert got["ids"] == ["drawer-1"], (
+        "compressed drawer not found in mempalace_closets — "
+        "cmd_compress wrote to the wrong collection (#1244)"
+    )
+    assert got["documents"] and got["documents"][0], "empty compressed doc"
+    meta = got["metadatas"][0]
+    assert meta.get("wing") == "test"
+    assert "compression_ratio" in meta
 
 
 def test_cmd_repair_trailing_slash_does_not_recurse():

From e4e25ed186bed5b9b9529a5de368b32ce9e91a7c Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:54:32 -0300
Subject: [PATCH 022/127] fix(mcp): forward valid_to and source params in
 kg_add/kg_invalidate (#1314)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`tool_kg_add` previously accepted only `valid_from` and `source_closet`,
silently dropping `valid_to`, `source_file`, and `source_drawer_id` at
the MCP boundary. Backfilling already-ended historical facts therefore
collapsed to "still current," and adapter provenance never reached
the SQLite layer even though `KnowledgeGraph.add_triple` already
supported every column.

`tool_kg_invalidate` returned the literal string `"today"` whenever the
caller omitted `ended`, hiding the actual stamped date from anyone trying
to verify what got persisted.

Changes:
- Extend `tool_kg_add` signature + MCP input_schema with `valid_to`,
  `source_file`, `source_drawer_id`; forward all of them to
  `_kg.add_triple` and to the WAL log.
- Resolve `ended` to `date.today().isoformat()` in `tool_kg_invalidate`
  before logging / returning, so the response always reports the actual
  date stored in `valid_to`.
- Add regression tests for valid_to round-trip, source_file /
  source_drawer_id provenance, and the resolved-ended-date contract.
- Leave TODO(#1283) markers so the open ISO-8601 validation PR can drop
  `validate_iso_date` over `valid_from` / `valid_to` / `ended` cleanly.

The underlying `KnowledgeGraph.add_triple` already accepted these
kwargs (RFC 002 §5.5) — only the MCP edge needed wiring up.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/mcp_server.py  | 73 +++++++++++++++++++++++++-----
 tests/test_mcp_server.py | 96 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 153 insertions(+), 16 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 13654f6..1862737 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -47,7 +47,7 @@
 import logging  # noqa: E402
 import hashlib  # noqa: E402
 import time  # noqa: E402
-from datetime import datetime  # noqa: E402
+from datetime import date, datetime  # noqa: E402
 from pathlib import Path  # noqa: E402
 
 from .config import (  # noqa: E402
@@ -677,7 +677,7 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
             "vector_disabled": True,
             "vector_disabled_reason": _vector_disabled_reason,
             "hint": (
-                "duplicate detection requires vector search; run " "`mempalace repair` to restore"
+                "duplicate detection requires vector search; run `mempalace repair` to restore"
             ),
         }
     try:
@@ -1061,9 +1061,26 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
 
 
 def tool_kg_add(
-    subject: str, predicate: str, object: str, valid_from: str = None, source_closet: str = None
+    subject: str,
+    predicate: str,
+    object: str,
+    valid_from: str = None,
+    valid_to: str = None,
+    source_closet: str = None,
+    source_file: str = None,
+    source_drawer_id: str = None,
 ):
-    """Add a relationship to the knowledge graph."""
+    """Add a relationship to the knowledge graph.
+
+    All temporal and provenance fields are optional. ``valid_to`` lets callers
+    backfill historical facts with a known end date in a single call (instead
+    of a separate ``kg_invalidate``). ``source_file`` and ``source_drawer_id``
+    are RFC 002 §5.5 provenance fields populated by adapters / bulk importers.
+
+    TODO(#1283): once the ISO-8601 validation PR lands, wire ``validate_iso_date``
+    over ``valid_from`` / ``valid_to`` here so malformed dates fail fast at the
+    MCP boundary instead of silently producing empty query results.
+    """
     try:
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
@@ -1078,32 +1095,56 @@ def tool_kg_add(
             "predicate": predicate,
             "object": object,
             "valid_from": valid_from,
+            "valid_to": valid_to,
             "source_closet": source_closet,
+            "source_file": source_file,
+            "source_drawer_id": source_drawer_id,
         },
     )
     triple_id = _kg.add_triple(
-        subject, predicate, object, valid_from=valid_from, source_closet=source_closet
+        subject,
+        predicate,
+        object,
+        valid_from=valid_from,
+        valid_to=valid_to,
+        source_closet=source_closet,
+        source_file=source_file,
+        source_drawer_id=source_drawer_id,
     )
     return {"success": True, "triple_id": triple_id, "fact": f"{subject} → {predicate} → {object}"}
 
 
 def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
-    """Mark a fact as no longer true (set end date)."""
+    """Mark a fact as no longer true (set end date).
+
+    Returns the actual ``ended`` date that was stored — when the caller omits
+    ``ended``, the underlying graph stamps ``date.today()``, and the response
+    reflects that resolved value (instead of the literal string ``"today"``)
+    so callers can verify what was persisted.
+
+    TODO(#1283): apply ``validate_iso_date`` to ``ended`` once that PR lands.
+    """
     try:
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
         object = sanitize_kg_value(object, "object")
     except ValueError as e:
         return {"success": False, "error": str(e)}
+    resolved_ended = ended or date.today().isoformat()
     _wal_log(
         "kg_invalidate",
-        {"subject": subject, "predicate": predicate, "object": object, "ended": ended},
+        {
+            "subject": subject,
+            "predicate": predicate,
+            "object": object,
+            "ended": resolved_ended,
+        },
     )
-    _kg.invalidate(subject, predicate, object, ended=ended)
+    _kg.invalidate(subject, predicate, object, ended=resolved_ended)
     return {
         "success": True,
         "fact": f"{subject} → {predicate} → {object}",
-        "ended": ended or "today",
+        "ended": resolved_ended,
     }
 
 
@@ -1440,7 +1481,7 @@ def tool_reconnect():
         "handler": tool_kg_query,
     },
     "mempalace_kg_add": {
-        "description": "Add a fact to the knowledge graph. Subject → predicate → object with optional time window. E.g. ('Max', 'started_school', 'Year 7', valid_from='2026-09-01').",
+        "description": "Add a fact to the knowledge graph. Subject → predicate → object with optional time window. E.g. ('Max', 'started_school', 'Year 7', valid_from='2026-09-01'). Pass valid_to to backfill an already-ended historical fact in a single call.",
         "input_schema": {
             "type": "object",
             "properties": {
@@ -1454,10 +1495,22 @@ def tool_reconnect():
                     "type": "string",
                     "description": "When this became true (YYYY-MM-DD, optional)",
                 },
+                "valid_to": {
+                    "type": "string",
+                    "description": "When this stopped being true (YYYY-MM-DD, optional). Use for backfilling already-ended historical facts.",
+                },
                 "source_closet": {
                     "type": "string",
                     "description": "Closet ID where this fact appears (optional)",
                 },
+                "source_file": {
+                    "type": "string",
+                    "description": "Source file path the fact was extracted from (optional)",
+                },
+                "source_drawer_id": {
+                    "type": "string",
+                    "description": "Drawer ID the fact was extracted from (optional, RFC 002 §5.5 provenance)",
+                },
             },
             "required": ["subject", "predicate", "object"],
         },
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index f8148af..2e769c2 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -476,9 +476,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert (
-            result1["drawer_id"] != result2["drawer_id"]
-        ), "Documents with shared header but different content must have distinct drawer IDs"
+        assert result1["drawer_id"] != result2["drawer_id"], (
+            "Documents with shared header but different content must have distinct drawer IDs"
+        )
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -650,6 +650,90 @@ def test_kg_invalidate(self, monkeypatch, config, palace_path, seeded_kg):
             ended="2026-03-01",
         )
         assert result["success"] is True
+        # Regression #1314: response must echo the actual ended date,
+        # not silently drop it and return the literal string "today".
+        assert result["ended"] == "2026-03-01"
+
+    def test_kg_add_forwards_valid_to(self, monkeypatch, config, palace_path, kg):
+        """Regression #1314 case 1: valid_to must round-trip through kg_add."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="_test_temporal",
+            predicate="had_value",
+            object="probe",
+            valid_from="2026-01-01",
+            valid_to="2026-04-28",
+        )
+        assert result["success"] is True
+
+        facts = kg.query_entity("_test_temporal")
+        assert len(facts) == 1
+        assert facts[0]["valid_from"] == "2026-01-01"
+        assert facts[0]["valid_to"] == "2026-04-28"
+        # An already-ended fact must not be reported as still current.
+        assert facts[0]["current"] is False
+
+    def test_kg_add_forwards_source_provenance(self, monkeypatch, config, palace_path, kg):
+        """Regression #1314 case 3: source_file / source_drawer_id reach storage."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace.mcp_server import tool_kg_add
+
+        result = tool_kg_add(
+            subject="operating-verb",
+            predicate="candidate",
+            object="husbandry",
+            valid_from="2026-04-28",
+            source_closet="closet-42",
+            source_file="docs/decisions.md",
+            source_drawer_id="drawer_abc123",
+        )
+        assert result["success"] is True
+
+        triple_id = result["triple_id"]
+        # Read raw row to verify all provenance columns persisted.
+        with kg._lock:
+            row = (
+                kg._conn()
+                .execute(
+                    "SELECT source_closet, source_file, source_drawer_id FROM triples WHERE id = ?",
+                    (triple_id,),
+                )
+                .fetchone()
+            )
+        assert row is not None
+        assert row["source_closet"] == "closet-42"
+        assert row["source_file"] == "docs/decisions.md"
+        assert row["source_drawer_id"] == "drawer_abc123"
+
+    def test_kg_invalidate_returns_actual_ended_date(
+        self, monkeypatch, config, palace_path, seeded_kg
+    ):
+        """Regression #1314 case 2: response reports the resolved date, not 'today'."""
+        from datetime import date as _date
+
+        _patch_mcp_server(monkeypatch, config, seeded_kg)
+        from mempalace.mcp_server import tool_kg_invalidate
+
+        # Caller-supplied date round-trips into the response.
+        explicit = tool_kg_invalidate(
+            subject="Max",
+            predicate="does",
+            object="swimming",
+            ended="2026-04-28",
+        )
+        assert explicit["ended"] == "2026-04-28"
+
+        # Caller-omitted date resolves to today's ISO date — never the
+        # literal string "today" the buggy implementation used to return.
+        implicit = tool_kg_invalidate(
+            subject="Max",
+            predicate="loves",
+            object="Chess",
+        )
+        assert implicit["ended"] != "today"
+        assert implicit["ended"] == _date.today().isoformat()
 
     def test_kg_timeline(self, monkeypatch, config, palace_path, seeded_kg):
         _patch_mcp_server(monkeypatch, config, seeded_kg)
@@ -960,9 +1044,9 @@ def _spy_create(self, name, **kwargs):
         all_calls = captured["get"] + captured["create"]
         assert all_calls, "expected get_collection or create_collection to be called"
         for kwargs in all_calls:
-            assert (
-                "embedding_function" in kwargs
-            ), f"missing embedding_function= in chromadb call: {kwargs}"
+            assert "embedding_function" in kwargs, (
+                f"missing embedding_function= in chromadb call: {kwargs}"
+            )
             assert kwargs["embedding_function"] is not None
 
         # Same expectation on the create=False (cache-miss) reopen path.

From 01b3183e5dabf5ee20e2c6b97f68b8008c4828d1 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:56:31 -0300
Subject: [PATCH 023/127] fix(cli): honor --palace flag in cmd_init (#1313)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cmd_init was instantiating MempalaceConfig() unconditionally, ignoring
args.palace and always writing the palace under ~/.mempalace. Mirror
the env-var pattern used by mcp_server.py (and consistent with how
cmd_mine / cmd_status / cmd_search resolve --palace) so every
downstream read of cfg.palace_path inside cmd_init — Pass 0,
cfg.init(), and the post-init mine — routes to the user-specified
location.

Adds tests/test_cli.py::test_cmd_init_honors_palace_flag covering the
regression: asserts Pass 0 receives the --palace value (not
~/.mempalace) and that MEMPALACE_PALACE_PATH is set in os.environ.

Closes #1313.
---
 mempalace/cli.py  |  7 +++++++
 tests/test_cli.py | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index ca9798b..54856db 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -232,6 +232,13 @@ def cmd_init(args):
     from .project_scanner import discover_entities
     from .room_detector_local import detect_rooms_local
 
+    # Honor --palace (issue #1313): without this, init silently ignored the
+    # flag and always used ~/.mempalace. Mirror the env-var pattern used by
+    # mcp_server.py so every downstream read of ``cfg.palace_path`` (Pass 0,
+    # cfg.init(), the post-init mine) routes to the user-specified location.
+    if getattr(args, "palace", None):
+        os.environ["MEMPALACE_PALACE_PATH"] = os.path.abspath(os.path.expanduser(args.palace))
+
     cfg = MempalaceConfig()
 
     # Resolve entity-detection languages: --lang overrides config.
diff --git a/tests/test_cli.py b/tests/test_cli.py
index af7b39d..c52e67f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -175,6 +175,55 @@ def test_cmd_init_normalizes_wing_name_for_topics_registry(mock_config_cls, tmp_
         assert mock_register.call_args.kwargs["wing"] == "my_cool_app"
 
 
+def test_cmd_init_honors_palace_flag(tmp_path, monkeypatch):
+    """Regression for #1313: ``cmd_init`` must honor ``--palace`` instead of
+    silently writing to ``~/.mempalace``. Mirrors the env-var pattern used
+    by ``cmd_mine`` / ``cmd_status`` / ``mcp_server`` so every downstream
+    read of ``cfg.palace_path`` (Pass 0, ``cfg.init()``, post-init mine)
+    routes to the user-specified location.
+    """
+    project = tmp_path / "project"
+    project.mkdir()
+    palace = tmp_path / "custom_palace"
+
+    # Make sure no leftover env var from another test leaks in — we want to
+    # verify that --palace ALONE drives the resolution.
+    monkeypatch.delenv("MEMPALACE_PALACE_PATH", raising=False)
+    monkeypatch.delenv("MEMPAL_PALACE_PATH", raising=False)
+
+    args = argparse.Namespace(
+        dir=str(project),
+        palace=str(palace),
+        yes=True,
+        auto_mine=False,
+    )
+
+    captured = {}
+
+    def fake_pass_zero(project_dir, palace_dir, llm_provider):
+        # Capture the palace_dir Pass 0 sees — this is the smoking-gun
+        # value for the bug. Pre-fix it was always ~/.mempalace.
+        captured["pass_zero_palace_dir"] = palace_dir
+        return None
+
+    with (
+        patch("mempalace.entity_detector.scan_for_detection", return_value=[]),
+        patch("mempalace.room_detector_local.detect_rooms_local"),
+        patch("mempalace.cli._run_pass_zero", side_effect=fake_pass_zero),
+        patch("mempalace.cli._maybe_run_mine_after_init"),
+    ):
+        cmd_init(args)
+
+    expected = str(palace)
+    # Pass 0 must have been handed the --palace location, not ~/.mempalace.
+    assert captured["pass_zero_palace_dir"] == expected
+    # And the env var must point at the custom palace so any downstream
+    # ``cfg.palace_path`` read in this process resolves correctly too.
+    import os
+
+    assert os.environ.get("MEMPALACE_PALACE_PATH") == os.path.abspath(expected)
+
+
 @patch("mempalace.cli.MempalaceConfig")
 def test_cmd_init_with_entities_zero_total(mock_config_cls, tmp_path, capsys):
     """When entities detected but total is 0, prints 'No entities' message."""

From 10733f1df474ea9e97a7a74e8b33ede5239a8138 Mon Sep 17 00:00:00 2001
From: igorls <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:56:36 -0300
Subject: [PATCH 024/127] fix(backends/chroma): wire quarantine_stale_hnsw into
 _client() to prevent SIGSEGV on stale HNSW (#1121, #1132, #1263)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #1173 wired quarantine_stale_hnsw into the static make_client() helper
but not into the instance _client() method. As a result every non-MCP
entry point (CLI mining, search, repair, status) — which all use
get_collection / _get_or_create_collection / _client() — skipped the
cold-start quarantine pass and could SIGSEGV on a stale HNSW segment
left over from a partial flush, replicated palace, or crashed-mid-write.

Refactor: extract the (_fix_blob_seq_ids + gated quarantine_stale_hnsw)
pre-open pass into a single private static helper
ChromaBackend._prepare_palace_for_open(). Both make_client() and
_client() now route through it, so the _quarantined_paths once-per-
palace-per-process gate is preserved (no runtime thrash on hot paths)
and behaviour stays identical — the fix is purely about extending the
existing protection to the path that was missing it.

Tests:

- test_client_quarantines_corrupt_segment_on_first_open mirrors the
  existing make_client test and verifies _client() actually renames a
  corrupt segment on first open.
- test_client_quarantines_only_on_first_call_per_palace verifies the
  cache gate prevents re-running quarantine across repeated _client()
  calls — important because _client() is hit on every backend op.

Closes #1121. Closes #1132. Closes #1263.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/backends/chroma.py | 32 ++++++++++++++++---
 tests/test_backends.py       | 61 ++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index 01ac627..d9b99a4 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -993,7 +993,7 @@ def _client(self, palace_path: str):
         )
 
         if cached is None or inode_changed or mtime_changed or mtime_appeared:
-            _fix_blob_seq_ids(palace_path)
+            ChromaBackend._prepare_palace_for_open(palace_path)
             cached = chromadb.PersistentClient(path=palace_path)
             self._clients[palace_path] = cached
             # Re-stat after the client constructor runs: chromadb creates
@@ -1028,6 +1028,31 @@ def _client(self, palace_path: str):
     # safety property; locking would add cost without correctness gain.
     _quarantined_paths: set[str] = set()
 
+    @staticmethod
+    def _prepare_palace_for_open(palace_path: str) -> None:
+        """Run the pre-open safety pass shared by :meth:`make_client` and
+        :meth:`_client`.
+
+        Two steps, both required before constructing a ``PersistentClient``:
+
+        1. ``_fix_blob_seq_ids`` — repairs the BLOB seq_id quirk that bites
+           certain chromadb migrations.
+        2. ``quarantine_stale_hnsw`` — gated by :attr:`_quarantined_paths` so
+           it fires once per palace per process. This is the SIGSEGV
+           prevention path for stale HNSW segments (see #1121, #1132, #1263);
+           wiring it through this helper means CLI mining, search, repair,
+           and status all benefit, not just the legacy ``make_client``
+           callers.
+
+        Idempotent: safe to call from any code path that is about to open or
+        re-open a palace. The ``_quarantined_paths`` gate prevents thrash on
+        hot paths (e.g. ``_client()`` is called on every backend operation).
+        """
+        _fix_blob_seq_ids(palace_path)
+        if palace_path not in ChromaBackend._quarantined_paths:
+            quarantine_stale_hnsw(palace_path)
+            ChromaBackend._quarantined_paths.add(palace_path)
+
     @staticmethod
     def make_client(palace_path: str):
         """Create a fresh ``PersistentClient`` (fixes BLOB seq_ids first).
@@ -1040,10 +1065,7 @@ def make_client(palace_path: str):
         :attr:`_quarantined_paths` for the rationale (cold-start protection
         vs. runtime thrash on steady-write daemons).
         """
-        _fix_blob_seq_ids(palace_path)
-        if palace_path not in ChromaBackend._quarantined_paths:
-            quarantine_stale_hnsw(palace_path)
-            ChromaBackend._quarantined_paths.add(palace_path)
+        ChromaBackend._prepare_palace_for_open(palace_path)
         return chromadb.PersistentClient(path=palace_path)
 
     @staticmethod
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 5efa71b..8364dc7 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -764,6 +764,67 @@ def _spy(path, stale_seconds=300.0):
     assert calls == [palace_a, palace_b]
 
 
+# ── _client() cold-start gate (#1121, #1132, #1263) ──────────────────────
+
+
+def test_client_quarantines_corrupt_segment_on_first_open(tmp_path, monkeypatch):
+    """The instance ``_client()`` path must run ``quarantine_stale_hnsw``
+    on first open, mirroring the ``make_client()`` static helper. Before
+    PR #1173's wiring was extended here, CLI mining / search / repair /
+    status all skipped the quarantine pass and would SIGSEGV on a stale
+    HNSW segment (#1121, #1132, #1263)."""
+    now = 1_700_000_000.0
+    palace, seg = _make_palace_with_segment(
+        tmp_path,
+        hnsw_mtime=now - 7200,
+        sqlite_mtime=now,
+        meta_bytes=_CORRUPT_META,
+    )
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    backend = ChromaBackend()
+    try:
+        backend._client(str(palace))
+    finally:
+        backend.close()
+
+    assert not seg.exists(), "_client() should have quarantined the corrupt segment"
+    drift_dirs = [p for p in palace.iterdir() if ".drift-" in p.name]
+    assert len(drift_dirs) == 1
+
+
+def test_client_quarantines_only_on_first_call_per_palace(tmp_path, monkeypatch):
+    """Repeated ``_client()`` calls for the same palace re-run quarantine
+    at most once — the ``_quarantined_paths`` gate prevents runtime
+    thrash on hot paths (``_client()`` is hit on every backend op)."""
+    palace_path = str(tmp_path / "palace")
+    os.makedirs(palace_path, exist_ok=True)
+    (Path(palace_path) / "chroma.sqlite3").write_text("")
+
+    monkeypatch.setattr(ChromaBackend, "_quarantined_paths", set())
+
+    calls: list[str] = []
+
+    def _spy(path, stale_seconds=300.0):
+        calls.append(path)
+        return []
+
+    monkeypatch.setattr("mempalace.backends.chroma.quarantine_stale_hnsw", _spy)
+
+    backend = ChromaBackend()
+    try:
+        backend._client(palace_path)
+        backend._client(palace_path)
+        backend._client(palace_path)
+    finally:
+        backend.close()
+
+    assert calls == [palace_path], (
+        "quarantine_stale_hnsw should fire once per palace per process from _client(), not on every call"
+    )
+
+
 # ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──
 
 
From e9222b4c7b98bc25942008942a96c5c9e2784795 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:57:09 -0300
Subject: [PATCH 025/127] fix(mcp): case-insensitive agent name in
 diary_write/diary_read (#1243)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`tool_diary_write` stored the `agent` metadata verbatim after `sanitize_name`
(which preserves case), while `tool_diary_read` filtered by exact match —
so writing as "Claude" and reading as "claude" silently returned zero rows.

Both endpoints now lowercase `agent_name` immediately after sanitization.
The default per-agent wing slug is also stable across casings since it's
derived from the same normalized form.

Behavior change: entries written prior to this fix under mixed-case agent
names will not match the new lowercase filter; documented under v3.3.5
in CHANGELOG with a `mempalace repair` pointer.

Adds a regression test (`test_diary_read_case_insensitive_agent`) and
updates the existing `test_diary_write_and_read` to assert the new
lowercase agent identity.

Closes #1243

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md             |  8 ++++++
 mempalace/mcp_server.py  | 17 +++++++++---
 tests/test_mcp_server.py | 59 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 41dfaac..d3982fe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 ---
 
+## [3.3.5] — unreleased
+
+### Bug Fixes
+
+- **`mempalace_diary_read` silently dropped entries on agent-name case mismatch.** `tool_diary_write` stored the `agent` metadata verbatim after `sanitize_name`, which preserves case, while `tool_diary_read` filtered by exact match. Writing as `"Claude"` and reading as `"claude"` (or vice-versa) returned zero rows. Both endpoints now lowercase `agent_name` immediately after sanitization, so reads are case-insensitive and the default per-agent wing slug is stable across casings. **Behavior change:** entries written prior to this fix under mixed-case agent names will not match the new lowercase filter; run `mempalace repair` if you need to migrate legacy diary metadata. (#1243)
+
+---
+
 ## [3.3.4] — unreleased
 
 ### Added
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 13654f6..7269376 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -677,7 +677,7 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
             "vector_disabled": True,
             "vector_disabled_reason": _vector_disabled_reason,
             "hint": (
-                "duplicate detection requires vector search; run " "`mempalace repair` to restore"
+                "duplicate detection requires vector search; run `mempalace repair` to restore"
             ),
         }
     try:
@@ -1133,9 +1133,13 @@ def tool_diary_write(agent_name: str, entry: str, topic: str = "general", wing:
 
     This is the agent's personal journal — observations, thoughts,
     what it worked on, what it noticed, what it thinks matters.
+
+    Note: ``agent_name`` is normalized to lowercase before storage so
+    that diary reads are case-insensitive (see #1243). "Claude",
+    "claude", and "CLAUDE" all resolve to the same agent.
     """
     try:
-        agent_name = sanitize_name(agent_name, "agent_name")
+        agent_name = sanitize_name(agent_name, "agent_name").lower()
         entry = sanitize_content(entry)
         topic = sanitize_name(topic, "topic")
     except ValueError as e:
@@ -1144,7 +1148,7 @@ def tool_diary_write(agent_name: str, entry: str, topic: str = "general", wing:
     if wing:
         wing = sanitize_name(wing)
     else:
-        wing = f"wing_{agent_name.lower().replace(' ', '_')}"
+        wing = f"wing_{agent_name.replace(' ', '_')}"
     room = "diary"
     col = _get_collection(create=True)
     if not col:
@@ -1209,9 +1213,14 @@ def tool_diary_read(agent_name: str, last_n: int = 10, wing: str = ""):
     written to. Diary writes from hooks land in project-derived wings
     (``wing_<project>``), so requiring a specific wing on read would
     silo those entries from agent-initiated reads.
+
+    Note: ``agent_name`` is normalized to lowercase before filtering so
+    that reads are case-insensitive (see #1243). Entries written under
+    pre-fix mixed-case agent names will not match the lowercase filter;
+    use ``mempalace repair`` to migrate legacy data if needed.
     """
     try:
-        agent_name = sanitize_name(agent_name, "agent_name")
+        agent_name = sanitize_name(agent_name, "agent_name").lower()
         if wing:
             wing = sanitize_name(wing)
     except ValueError as e:
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index f8148af..b0f6c29 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -476,9 +476,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert (
-            result1["drawer_id"] != result2["drawer_id"]
-        ), "Documents with shared header but different content must have distinct drawer IDs"
+        assert result1["drawer_id"] != result2["drawer_id"], (
+            "Documents with shared header but different content must have distinct drawer IDs"
+        )
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -682,7 +682,8 @@ def test_diary_write_and_read(self, monkeypatch, config, palace_path, kg):
             topic="architecture",
         )
         assert w["success"] is True
-        assert w["agent"] == "TestAgent"
+        # agent_name is normalized to lowercase on write (#1243).
+        assert w["agent"] == "testagent"
 
         r = tool_diary_read(agent_name="TestAgent")
         assert r["total"] == 1
@@ -774,6 +775,50 @@ def test_diary_read_empty_wing_spans_all_wings(self, monkeypatch, config, palace
         assert r_scoped["total"] == 1
         assert r_scoped["entries"][0]["content"] == "project-wing entry"
 
+    def test_diary_read_case_insensitive_agent(self, monkeypatch, config, palace_path, kg):
+        """Regression for #1243: diary_read must be case-insensitive over
+        agent_name. Writing as "Claude" and reading as "claude" (or vice
+        versa) must surface the same entries — sanitize_name preserved
+        case, which silently dropped reads when the agent name's casing
+        differed from the write."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace.mcp_server import tool_diary_read, tool_diary_write
+
+        # Write as "Claude" → read as "claude" should match.
+        w1 = tool_diary_write(
+            agent_name="Claude",
+            entry="entry written as Claude",
+            topic="general",
+        )
+        assert w1["success"]
+
+        r1 = tool_diary_read(agent_name="claude")
+        assert "entries" in r1, r1
+        contents1 = {e["content"] for e in r1["entries"]}
+        assert "entry written as Claude" in contents1
+
+        # Write as "CLAUDE" → read as "Claude" should also match the
+        # same agent. After normalization both writes target the same
+        # lowercase agent identity, so both entries are returned.
+        w2 = tool_diary_write(
+            agent_name="CLAUDE",
+            entry="entry written as CLAUDE",
+            topic="general",
+        )
+        assert w2["success"]
+
+        r2 = tool_diary_read(agent_name="Claude")
+        contents2 = {e["content"] for e in r2["entries"]}
+        assert "entry written as Claude" in contents2
+        assert "entry written as CLAUDE" in contents2
+
+        # The stored agent metadata is the lowercase form, and the
+        # default wing is derived from that lowercase form too.
+        assert w1["agent"] == "claude"
+        assert w2["agent"] == "claude"
+
 
 # ── Cache Invalidation (inode/mtime) ──────────────────────────────────
 
@@ -960,9 +1005,9 @@ def _spy_create(self, name, **kwargs):
         all_calls = captured["get"] + captured["create"]
         assert all_calls, "expected get_collection or create_collection to be called"
         for kwargs in all_calls:
-            assert (
-                "embedding_function" in kwargs
-            ), f"missing embedding_function= in chromadb call: {kwargs}"
+            assert "embedding_function" in kwargs, (
+                f"missing embedding_function= in chromadb call: {kwargs}"
+            )
             assert kwargs["embedding_function"] is not None
 
         # Same expectation on the create=False (cache-miss) reopen path.

From 4b0fc444515f8efb5bceb41cec75f5db0807e297 Mon Sep 17 00:00:00 2001
From: igorls <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:58:45 -0300
Subject: [PATCH 026/127] style: ruff format cli.py (#1244)

CI requires ruff format --check on the whole touched file. Pre-existing drift, no logic change.
---
 mempalace/cli.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index d47f38e..d57fcc8 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -310,8 +310,7 @@ def cmd_init(args):
                 )
         except LLMError as e:
             print(
-                f"  LLM init failed ({e}). "
-                f"Running heuristics-only — pass --no-llm to silence this."
+                f"  LLM init failed ({e}). Running heuristics-only — pass --no-llm to silence this."
             )
 
     # Pass 0: detect whether the corpus is AI-dialogue. Writes

From b4a9f2adf21141a4dbf51ef1835e4c4b2488bba7 Mon Sep 17 00:00:00 2001
From: igorls <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:58:57 -0300
Subject: [PATCH 027/127] style: ruff format touched files (PR #1322)

CI requires whole-file format on touched files; pre-existing drift only.
---
 tests/test_backends.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_backends.py b/tests/test_backends.py
index 8364dc7..5a4eb4b 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -730,9 +730,9 @@ def _spy(path, stale_seconds=300.0):
     ChromaBackend.make_client(palace_path)
     ChromaBackend.make_client(palace_path)
 
-    assert calls == [
-        palace_path
-    ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+    assert calls == [palace_path], (
+        "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
+    )
 
 
 def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):

From 6ffbf6ffc3a1d0828f0e6c525b48d0b4541ab5f7 Mon Sep 17 00:00:00 2001
From: igorls <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 22:59:50 -0300
Subject: [PATCH 028/127] style: ruff format test_mcp_server.py (PR #1320)

---
 tests/test_mcp_server.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 2e769c2..2cae421 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -476,9 +476,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert result1["drawer_id"] != result2["drawer_id"], (
-            "Documents with shared header but different content must have distinct drawer IDs"
-        )
+        assert (
+            result1["drawer_id"] != result2["drawer_id"]
+        ), "Documents with shared header but different content must have distinct drawer IDs"
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -1044,9 +1044,9 @@ def _spy_create(self, name, **kwargs):
         all_calls = captured["get"] + captured["create"]
         assert all_calls, "expected get_collection or create_collection to be called"
         for kwargs in all_calls:
-            assert "embedding_function" in kwargs, (
-                f"missing embedding_function= in chromadb call: {kwargs}"
-            )
+            assert (
+                "embedding_function" in kwargs
+            ), f"missing embedding_function= in chromadb call: {kwargs}"
             assert kwargs["embedding_function"] is not None
 
         # Same expectation on the create=False (cache-miss) reopen path.

From 2857948c1ead60903aeba2228fd6911ab4efcbfe Mon Sep 17 00:00:00 2001
From: igorls <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 23:00:07 -0300
Subject: [PATCH 029/127] style: ruff format tests/test_cli.py (PR #1319)

---
 tests/test_cli.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 74521e6..7a7deba 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -930,9 +930,9 @@ def test_cmd_compress_stores_results(mock_config_cls, capsys):
     # Verify the compress output goes to the closets collection so that
     # palace.get_closets_collection() / searcher can read it back (#1244).
     (call_args, _kwargs) = mock_backend.get_or_create_collection.call_args
-    assert call_args[1] == "mempalace_closets", (
-        f"compress should write to mempalace_closets, got {call_args[1]!r}"
-    )
+    assert (
+        call_args[1] == "mempalace_closets"
+    ), f"compress should write to mempalace_closets, got {call_args[1]!r}"
     assert "mempalace_closets" in out
 
 
From f854d86d2f378ca9e926080c2e329acca7d6b170 Mon Sep 17 00:00:00 2001
From: igorls <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 23:00:08 -0300
Subject: [PATCH 030/127] style: ruff format tests/test_backends.py (PR #1322)

---
 tests/test_backends.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_backends.py b/tests/test_backends.py
index 5a4eb4b..4ddfe12 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -730,9 +730,9 @@ def _spy(path, stale_seconds=300.0):
     ChromaBackend.make_client(palace_path)
     ChromaBackend.make_client(palace_path)
 
-    assert calls == [palace_path], (
-        "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
-    )
+    assert calls == [
+        palace_path
+    ], "quarantine_stale_hnsw should fire once per palace per process, not on every reconnect"
 
 
 def test_make_client_quarantines_each_palace_independently(tmp_path, monkeypatch):
@@ -820,9 +820,9 @@ def _spy(path, stale_seconds=300.0):
     finally:
         backend.close()
 
-    assert calls == [palace_path], (
-        "quarantine_stale_hnsw should fire once per palace per process from _client(), not on every call"
-    )
+    assert (
+        calls == [palace_path]
+    ), "quarantine_stale_hnsw should fire once per palace per process from _client(), not on every call"
 
 
 # ── _pin_hnsw_threads (per-process retrofit, separate from this PR's gate) ──

From 2397481158bd47e4657ef10f852192e93426efcb Mon Sep 17 00:00:00 2001
From: igorls <4753812+igorls@users.noreply.github.com>
Date: Sat, 2 May 2026 23:00:10 -0300
Subject: [PATCH 031/127] style: ruff format tests/test_mcp_server.py (PR
 #1323)

---
 tests/test_mcp_server.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index b0f6c29..16dad6d 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -476,9 +476,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert result1["drawer_id"] != result2["drawer_id"], (
-            "Documents with shared header but different content must have distinct drawer IDs"
-        )
+        assert (
+            result1["drawer_id"] != result2["drawer_id"]
+        ), "Documents with shared header but different content must have distinct drawer IDs"
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -1005,9 +1005,9 @@ def _spy_create(self, name, **kwargs):
         all_calls = captured["get"] + captured["create"]
         assert all_calls, "expected get_collection or create_collection to be called"
         for kwargs in all_calls:
-            assert "embedding_function" in kwargs, (
-                f"missing embedding_function= in chromadb call: {kwargs}"
-            )
+            assert (
+                "embedding_function" in kwargs
+            ), f"missing embedding_function= in chromadb call: {kwargs}"
             assert kwargs["embedding_function"] is not None
 
         # Same expectation on the create=False (cache-miss) reopen path.

From b2f259c25304e89bc19bf767dd8c971251ba7026 Mon Sep 17 00:00:00 2001
From: icciAaron <asalsitz@icci.com>
Date: Sun, 19 Apr 2026 15:01:28 -0400
Subject: [PATCH 032/127] fix(mcp): omit palace_path from tool_status responses
 (+ docs)

The MCP `mempalace_status` tool was returning the server's absolute
`_config.palace_path` to any connected client on both the main
(ChromaDB-backed) path and the sqlite fallback path that runs when
HNSW divergence is detected (#1222). On a single-user local deployment
this is self-disclosure, but in nested-agent or multi-server MCP
topologies the client is a separate trust domain and the absolute
path has no documented client-side use.

Clients that legitimately need the palace path continue to have three
documented channels: the `MEMPALACE_PALACE_PATH` env var (primary) or
its legacy `MEMPAL_PALACE_PATH` alias, the `~/.mempalace/config.json`
file, and the `--palace` CLI flag on most subcommands.

Also corrects stale docs that claimed `mempalace_reconnect` returned a
`palace_path` field; the code returns `{success, message, drawers,
vector_disabled[, vector_disabled_reason]}` on success, plus a no-palace
shape and an exception shape.

- mempalace/mcp_server.py: drop palace_path from tool_status() and
  _tool_status_via_sqlite() result dicts
- website/reference/mcp-tools.md: update documented return shapes for
  mempalace_status (fix) and mempalace_reconnect (stale-docs correction)

Authored-by: Aaron Salsitz (ICCI LLC, @icciaaron). Claude Code was used
as an authoring and review-orchestration tool, with human-in-the-loop
oversight at every step: Aaron wrote the prompts, reviewed each draft,
called for three independent review passes (drafting / post-rebase
technical / CISA-aligned disclosure-leak), and verified the final patch
behavior before commit.
---
 mempalace/mcp_server.py        | 2 --
 website/reference/mcp-tools.md | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 13654f6..4aab316 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -454,7 +454,6 @@ def _tool_status_via_sqlite() -> dict:
         "total_drawers": total,
         "wings": wings,
         "rooms": rooms,
-        "palace_path": _config.palace_path,
         "protocol": PALACE_PROTOCOL,
         "aaak_dialect": AAAK_SPEC,
         "vector_disabled": True,
@@ -493,7 +492,6 @@ def tool_status():
         "total_drawers": count,
         "wings": wings,
         "rooms": rooms,
-        "palace_path": _config.palace_path,
         "protocol": PALACE_PROTOCOL,
         "aaak_dialect": AAAK_SPEC,
     }
diff --git a/website/reference/mcp-tools.md b/website/reference/mcp-tools.md
index f951fe1..671225a 100644
--- a/website/reference/mcp-tools.md
+++ b/website/reference/mcp-tools.md
@@ -10,7 +10,7 @@ Palace overview: total drawers, wing and room counts, AAAK spec, and memory prot
 
 **Parameters:** None
 
-**Returns:** `{ total_drawers, wings, rooms, palace_path, protocol, aaak_dialect }`
+**Returns:** `{ total_drawers, wings, rooms, protocol, aaak_dialect }`
 
 ---
 
@@ -378,4 +378,4 @@ Force a reconnect to the palace database. Use this after external scripts or CLI
 
 **Parameters:** None
 
-**Returns:** `{ success, palace_path }`
+**Returns:** `{ success, message, drawers, vector_disabled[, vector_disabled_reason] }` (on no-palace: `{ success: false, message, drawers, vector_disabled }`; on exception: `{ success: false, error }`)

From 7fc260f75236551707fa3932adb994d4cb3f6332 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sun, 3 May 2026 05:48:41 -0300
Subject: [PATCH 033/127] fix(mcp): basename source_file in tool_get_drawer
 responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MCP `mempalace_get_drawer` tool returned the entire raw drawer
metadata blob to any connected client, and the `source_file` field
in that blob is the absolute filesystem path written by the miners
(`miner.py`, `convo_miner.py` — `source_file = str(filepath)`). On
a single-user local deployment this is self-disclosure, but in
nested-agent or multi-server MCP topologies the client is a separate
trust domain and the host's directory layout has no documented
client-side use.

Mirror the mitigation that `searcher.search_memories()` already applies
on its own return path: reduce `source_file` to its basename via
`Path(source_file).name` before handing the metadata to the client.
Citations still work — the directory layout does not leak.

Companion to #1 (omit palace_path from tool_status). Same threat class,
different surface:

- mempalace_status — palace dir path     → fixed in #1
- mempalace_get_drawer — per-drawer source_file path → this PR

Other read tools were audited and do not leak host paths:
- mempalace_search    — already basenames source_file
- mempalace_list_drawers — returns wing/room/preview only
- mempalace_diary_read   — date/timestamp/topic/content only
- mempalace_reconnect    — success/message/drawers only
- mempalace_kg_*         — entity/predicate strings, counts
- mempalace_check_duplicate — wing/room/preview only

Changes:
- mempalace/mcp_server.py: tool_get_drawer() now basenames metadata.source_file
- tests/test_mcp_server.py: regression test asserting the absolute path
  and its parent directory do not appear anywhere in the response
- website/reference/mcp-tools.md: clarify the documented return shape
---
 mempalace/mcp_server.py        | 15 ++++++++++---
 tests/test_mcp_server.py       | 39 ++++++++++++++++++++++++++++++++++
 website/reference/mcp-tools.md |  2 +-
 3 files changed, 52 insertions(+), 4 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 4aab316..b010ab9 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -912,12 +912,21 @@ def tool_get_drawer(drawer_id: str):
             return {"error": f"Drawer not found: {drawer_id}"}
         meta = result["metadatas"][0]
         doc = result["documents"][0]
+        # source_file is the absolute filesystem path written by the
+        # miners. Reduce to its basename before handing it to the MCP
+        # client — same threat model as the palace_path leak fix:
+        # nested-agent / multi-server topologies treat the client as a
+        # separate trust domain. Basename preserves citation utility.
+        # Mirrors the searcher.search_memories() return shape.
+        safe_meta = dict(meta) if meta else {}
+        if safe_meta.get("source_file"):
+            safe_meta["source_file"] = Path(safe_meta["source_file"]).name
         return {
             "drawer_id": drawer_id,
             "content": doc,
-            "wing": meta.get("wing", ""),
-            "room": meta.get("room", ""),
-            "metadata": meta,
+            "wing": safe_meta.get("wing", ""),
+            "room": safe_meta.get("room", ""),
+            "metadata": safe_meta,
         }
     except Exception as e:
         return {"error": str(e)}
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index f8148af..0e37e35 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -531,6 +531,45 @@ def test_get_drawer_not_found(self, monkeypatch, config, palace_path, seeded_col
         result = tool_get_drawer("nonexistent_drawer")
         assert "error" in result
 
+    def test_get_drawer_does_not_leak_absolute_source_file_path(
+        self, monkeypatch, config, palace_path, collection, kg
+    ):
+        """tool_get_drawer must not expose the absolute filesystem path
+        that the miners write into ``source_file``. Same threat class as
+        the palace_path leak in mempalace_status: in nested-agent or
+        multi-server MCP topologies the client is a separate trust
+        domain, and the directory layout of the host has no documented
+        client-side use. Basename is enough for citation."""
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        secret_dir = "/private/home/alice/secret-research/2026"
+        absolute_source = f"{secret_dir}/notes.md"
+        collection.add(
+            ids=["drawer_leak_probe"],
+            documents=["verbatim drawer body for leak probe"],
+            metadatas=[
+                {
+                    "wing": "research",
+                    "room": "notes",
+                    "source_file": absolute_source,
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-03T00:00:00",
+                }
+            ],
+        )
+
+        from mempalace.mcp_server import tool_get_drawer
+
+        result = tool_get_drawer("drawer_leak_probe")
+        assert result["drawer_id"] == "drawer_leak_probe"
+        assert result["metadata"]["source_file"] == "notes.md"
+        # Defense-in-depth: no field anywhere in the response should
+        # contain the absolute path or its parent directory.
+        serialized = json.dumps(result)
+        assert absolute_source not in serialized
+        assert secret_dir not in serialized
+
     def test_list_drawers(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
         from mempalace.mcp_server import tool_list_drawers
diff --git a/website/reference/mcp-tools.md b/website/reference/mcp-tools.md
index 671225a..6866aa6 100644
--- a/website/reference/mcp-tools.md
+++ b/website/reference/mcp-tools.md
@@ -122,7 +122,7 @@ Fetch a single drawer by ID — returns full content and metadata.
 |-----------|------|----------|-------------|
 | `drawer_id` | string | **Yes** | ID of the drawer to fetch |
 
-**Returns:** `{ drawer: { id, wing, room, content, ... } }`
+**Returns:** `{ drawer_id, content, wing, room, metadata }` where `metadata.source_file`, when present, is the basename only — the absolute path written by the miners is reduced before the dict is returned to MCP clients.
 
 ---
 

From 3eb7980e5540cb68a756bfb582cf4cccbf1875c8 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sun, 3 May 2026 06:09:10 -0300
Subject: [PATCH 034/127] fix(searcher): address Copilot review on #1306
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Dedup union candidates by (full_path, chunk_index), not basename —
  two files sharing a basename in different dirs no longer collide,
  and a vector hit on chunk N of a file no longer blocks BM25 from
  contributing chunk M of the same file.
- Validate candidate_strategy at the top of search_memories so invalid
  values fail consistently, not only when the call routes through the
  vector path.
- Trim hits back to n_results after the union+rerank pool grows;
  preserves the existing search_memories size contract that the MCP
  limit parameter is built on.
- Skip BM25-only injection when max_distance > 0.0; BM25-only
  candidates carry distance=None and would silently bypass the
  caller's strict vector-distance threshold.

Adds 4 tests covering: validation under vector_disabled, n_results
trim, max_distance honoring, and basename-collision dedup.
---
 mempalace/searcher.py                | 114 ++++++++++++++++++++++-----
 tests/test_hybrid_candidate_union.py | 101 ++++++++++++++++++++++++
 2 files changed, 197 insertions(+), 18 deletions(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index 7a46158..d615623 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -381,6 +381,7 @@ def _bm25_only_via_sqlite(
     room: str = None,
     n_results: int = 5,
     max_candidates: int = 500,
+    _include_internal: bool = False,
 ) -> dict:
     """BM25-only search reading drawers directly from chroma.sqlite3.
 
@@ -518,17 +519,25 @@ def _bm25_only_via_sqlite(
             continue
         if room and meta.get("room") != room:
             continue
+        full_source = meta.get("source_file", "") or ""
         candidates.append(
             {
                 "text": d["text"],
                 "wing": meta.get("wing", "unknown"),
                 "room": meta.get("room", "unknown"),
-                "source_file": Path(meta.get("source_file", "?") or "?").name,
+                "source_file": Path(full_source).name if full_source else "?",
                 "created_at": meta.get("filed_at", "unknown"),
                 # No vector distance available in BM25-only mode.
                 "similarity": None,
                 "distance": None,
                 "matched_via": "bm25_sqlite",
+                # Internal: full path + chunk_index let callers (notably
+                # candidate_strategy="union") dedupe at chunk granularity
+                # rather than basename — two files in different directories
+                # may share a basename, and one source_file is split across
+                # multiple chunks. Stripped before this helper returns.
+                "_source_file_full": full_source,
+                "_chunk_index": meta.get("chunk_index"),
             }
         )
 
@@ -543,6 +552,12 @@ def _bm25_only_via_sqlite(
     hits = candidates[:n_results]
     for h in hits:
         h.pop("_score", None)
+        # Strip internal fields by default so the public BM25-only fallback
+        # response stays clean. Callers that need chunk-precise dedup
+        # (notably the union-merge path) opt in via _include_internal.
+        if not _include_internal:
+            h.pop("_source_file_full", None)
+            h.pop("_chunk_index", None)
 
     return {
         "query": query,
@@ -561,17 +576,33 @@ def _merge_bm25_union_candidates(
     wing: str,
     room: str,
     n_results: int,
+    max_distance: float = 0.0,
 ) -> None:
     """Append top-K BM25-only candidates from sqlite into ``hits`` in place.
 
     Used by ``search_memories(..., candidate_strategy="union")`` to widen
     the rerank pool's *source* (not just its size) — vector-only candidate
     selection skips docs whose embeddings are far from the query even when
-    BM25 signal is strong. We dedupe against existing hits by ``source_file``
-    so vector-side entries (which carry real distance values) win on
-    collisions; BM25-only additions are marked with ``distance=None`` so
-    ``_hybrid_rank`` scores them on BM25 contribution alone.
+    BM25 signal is strong.
+
+    Dedup is chunk-precise: the key is ``(_source_file_full, _chunk_index)``
+    so two files sharing a basename in different directories don't collide,
+    and a vector hit on chunk N of a file doesn't block BM25 from
+    contributing chunk M of the same file. Falls back to ``source_file``
+    only when full-path/chunk metadata is absent.
+
+    BM25-only additions carry ``distance=None`` so ``_hybrid_rank`` scores
+    them on BM25 contribution alone.
+
+    When ``max_distance > 0.0`` (a strict vector-distance threshold is
+    set), BM25-only candidates are skipped entirely — they have no vector
+    distance to satisfy the threshold, and silently injecting them would
+    break the existing ``max_distance`` guarantee that hybrid results lie
+    within the requested vector-distance bound.
     """
+    if max_distance > 0.0:
+        return
+
     try:
         bm25_extra = _bm25_only_via_sqlite(
             query,
@@ -579,21 +610,32 @@ def _merge_bm25_union_candidates(
             wing=wing,
             room=room,
             n_results=n_results * 3,
+            _include_internal=True,
         ).get("results", [])
     except Exception:
         logger.debug("candidate_strategy=union: BM25 fetch failed", exc_info=True)
         return
 
-    seen_sources = {h.get("source_file") for h in hits}
+    def _dedup_key(entry: dict):
+        full = entry.get("_source_file_full")
+        ci = entry.get("_chunk_index")
+        if full and ci is not None:
+            return (full, ci)
+        # Fall back to basename only when richer metadata is missing —
+        # avoids silently dropping candidates on legacy data while still
+        # giving chunk-precise dedup whenever the metadata is present.
+        return entry.get("source_file")
+
+    seen = {_dedup_key(h) for h in hits}
     for bh in bm25_extra:
-        key = bh.get("source_file")
-        if not key or key == "?" or key in seen_sources:
+        key = _dedup_key(bh)
+        if not key or key == "?" or key in seen:
             continue
         bh["distance"] = None
         bh["effective_distance"] = None
         bh["closet_boost"] = 0.0
         hits.append(bh)
-        seen_sources.add(key)
+        seen.add(key)
 
 
 # Strategy dispatch — keeps search_memories' branch count under the
@@ -605,6 +647,19 @@ def _merge_bm25_union_candidates(
 }
 
 
+def _validate_candidate_strategy(strategy: str) -> None:
+    """Raise ``ValueError`` for unknown strategies.
+
+    Called eagerly at the top of ``search_memories`` so invalid values
+    fail consistently regardless of whether the call routes through the
+    vector path, the BM25-only fallback, or returns an early error dict.
+    """
+    if strategy not in _CANDIDATE_MERGERS:
+        raise ValueError(
+            f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, got {strategy!r}"
+        )
+
+
 def _apply_candidate_strategy(
     strategy: str,
     hits: list,
@@ -613,18 +668,16 @@ def _apply_candidate_strategy(
     wing: str,
     room: str,
     n_results: int,
+    max_distance: float = 0.0,
 ) -> None:
     """Dispatch to the registered merger for ``strategy``.
 
-    Raises ``ValueError`` for unknown strategies. ``"vector"`` is a no-op.
+    Strategy validity is assumed (``_validate_candidate_strategy`` runs
+    earlier); ``"vector"`` is a no-op.
     """
-    if strategy not in _CANDIDATE_MERGERS:
-        raise ValueError(
-            f"candidate_strategy must be one of {tuple(_CANDIDATE_MERGERS)}, " f"got {strategy!r}"
-        )
     merger = _CANDIDATE_MERGERS[strategy]
     if merger is not None:
-        merger(hits, query, palace_path, wing, room, n_results)
+        merger(hits, query, palace_path, wing, room, n_results, max_distance=max_distance)
 
 
 def search_memories(
@@ -669,7 +722,16 @@ def search_memories(
               by scenario descriptions). Adds one sqlite open + FTS5 MATCH
               per query; perf cost is small but unmeasured at corpus scale.
               Opt in until the cost is characterized.
+
+              When ``max_distance > 0.0`` is also set, BM25-only candidates
+              are skipped — they have no vector distance and would silently
+              violate the requested distance threshold.
     """
+    # Validate the strategy eagerly so invalid values fail the same way
+    # regardless of whether the call routes through the vector path or
+    # the BM25-only fallback below.
+    _validate_candidate_strategy(candidate_strategy)
+
     if vector_disabled:
         return _bm25_only_via_sqlite(
             query,
@@ -848,10 +910,26 @@ def search_memories(
     # Candidate strategy hook: optionally widen the rerank pool's *source*
     # before ranking. Default ("vector") is a no-op; "union" merges top-K
     # BM25 candidates from sqlite. See `_apply_candidate_strategy`.
-    _apply_candidate_strategy(candidate_strategy, hits, query, palace_path, wing, room, n_results)
+    # ``max_distance`` is forwarded so union mode can refuse to inject
+    # BM25-only (distance=None) candidates that would silently bypass the
+    # caller's strict distance threshold.
+    _apply_candidate_strategy(
+        candidate_strategy,
+        hits,
+        query,
+        palace_path,
+        wing,
+        room,
+        n_results,
+        max_distance=max_distance,
+    )
 
-    # BM25 hybrid re-rank within the final candidate set.
-    hits = _hybrid_rank(hits, query)
+    # BM25 hybrid re-rank within the final candidate set, then trim back
+    # to the requested size. Without the trim, ``candidate_strategy="union"``
+    # would return up to 4× ``n_results`` (vector hits + BM25 union pool),
+    # breaking the existing ``search_memories`` size contract that the MCP
+    # ``limit`` parameter is built on.
+    hits = _hybrid_rank(hits, query)[:n_results]
     for h in hits:
         h.pop("_sort_key", None)
         h.pop("_source_file_full", None)
diff --git a/tests/test_hybrid_candidate_union.py b/tests/test_hybrid_candidate_union.py
index 97cf4d1..feca81e 100644
--- a/tests/test_hybrid_candidate_union.py
+++ b/tests/test_hybrid_candidate_union.py
@@ -113,6 +113,107 @@ def test_invalid_candidate_strategy_raises(self, tmp_path):
         with pytest.raises(ValueError, match="candidate_strategy"):
             search_memories("anything", palace, n_results=5, candidate_strategy="bogus")
 
+    def test_invalid_strategy_raises_even_when_vector_disabled(self, tmp_path):
+        """Validation must happen before the ``vector_disabled`` early return —
+        invalid values must fail consistently regardless of routing."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        import pytest
+
+        with pytest.raises(ValueError, match="candidate_strategy"):
+            search_memories(
+                "anything",
+                palace,
+                n_results=5,
+                vector_disabled=True,
+                candidate_strategy="bogus",
+            )
+
+    def test_union_respects_n_results_limit(self, tmp_path):
+        """When the merged candidate set is larger than ``n_results``, the
+        result must be trimmed back to the requested size — the MCP
+        ``limit`` contract depends on this invariant."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        # 4-doc corpus, n_results=2 → union pool can grow to ~8 candidates,
+        # rerank reorders them, but final list must respect the cap.
+        result = search_memories(_NARRATIVE_QUERY, palace, n_results=2, candidate_strategy="union")
+        assert (
+            len(result["results"]) <= 2
+        ), f"union must trim to n_results=2; got {len(result['results'])} results"
+
+    def test_union_skipped_when_max_distance_set(self, tmp_path):
+        """``max_distance`` is a vector-distance threshold; BM25-only
+        candidates have ``distance=None`` and cannot satisfy it. Union
+        must not silently inject them when a strict threshold is set,
+        otherwise the existing ``max_distance`` guarantee regresses."""
+        palace = str(tmp_path / "palace")
+        _seed_drawers(palace)
+        # Sanity: without max_distance, union surfaces the BM25-strong doc.
+        unfiltered = search_memories(
+            _NARRATIVE_QUERY, palace, n_results=5, candidate_strategy="union"
+        )
+        assert "brand_voice_D4.md" in {h["source_file"] for h in unfiltered["results"]}
+
+        # With a tight max_distance, union must NOT inject BM25-only hits —
+        # every returned hit must have a real (non-None) distance.
+        filtered = search_memories(
+            _NARRATIVE_QUERY,
+            palace,
+            n_results=5,
+            candidate_strategy="union",
+            max_distance=0.5,
+        )
+        for h in filtered["results"]:
+            assert h.get("distance") is not None, (
+                f"union under max_distance must not inject BM25-only "
+                f"(distance=None) candidates; offending hit: {h}"
+            )
+            assert h["distance"] <= 0.5, f"hit violates max_distance=0.5: distance={h['distance']}"
+
+    def test_union_dedup_is_chunk_precise_not_basename(self, tmp_path):
+        """Two files with the same basename in different directories must
+        not collide — union must dedup on full path (or chunk-level key),
+        not on basename alone. Otherwise a BM25-strong README from one
+        directory silently shadows a BM25-strong README from another.
+        """
+        palace = str(tmp_path / "palace")
+        col = get_collection(palace, create=True)
+        col.upsert(
+            ids=["A_README", "B_README", "narrative"],
+            documents=[
+                # Both README files share the basename README.md but live
+                # in different directories. Each contains distinctive
+                # terminology a query might surface via BM25.
+                "PROJECT ALPHA: configuration for the Frobnitz subsystem. "
+                "Set FROBNITZ_TIMEOUT=30 to enable widget rotation.",
+                "PROJECT BETA: configuration for the Wibble subsystem. "
+                "Set WIBBLE_THRESHOLD=0.5 to enable signal smoothing.",
+                "Engineers occasionally chat about how the legacy "
+                "subsystems all need their config knobs tweaked.",
+            ],
+            metadatas=[
+                {"wing": "code", "room": "docs", "source_file": "alpha/README.md"},
+                {"wing": "code", "room": "docs", "source_file": "beta/README.md"},
+                {"wing": "code", "room": "docs", "source_file": "chat.md"},
+            ],
+        )
+        # Query that hits BM25 for BOTH READMEs (distinct vocab from each).
+        # Vector-only might pick the chat doc as semantically "closest";
+        # union must surface both READMEs without basename collision.
+        result = search_memories(
+            "FROBNITZ_TIMEOUT WIBBLE_THRESHOLD configuration",
+            palace,
+            n_results=5,
+            candidate_strategy="union",
+        )
+        sources = [h["source_file"] for h in result["results"]]
+        readme_count = sum(1 for s in sources if s == "README.md")
+        assert readme_count >= 2, (
+            f"union must surface both README.md files from different dirs "
+            f"(basename collision would drop one); got sources={sources}"
+        )
+
 
 class TestHybridRankTolerantOfMissingDistance:
     """``_hybrid_rank`` accepts ``distance=None`` — required for BM25-only

From 0e65c54978463102736bdbead64916b4845d431e Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sun, 3 May 2026 06:28:12 -0300
Subject: [PATCH 035/127] =?UTF-8?q?docs(mcp):=20drop=20=C2=A75.5=20from=20?=
 =?UTF-8?q?kg=5Fadd=20docstring/schema?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The repo's anti-jargon meta-test bans §N markers outside the
sources/backends allowlist. mcp_server.py isn't allowlisted, so the
"RFC 002 §5.5" references added in this PR turned the test red.
Trim to "RFC 002" — section number isn't load-bearing for the description.
---
 mempalace/mcp_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 1862737..7a979e3 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1075,7 +1075,7 @@ def tool_kg_add(
     All temporal and provenance fields are optional. ``valid_to`` lets callers
     backfill historical facts with a known end date in a single call (instead
     of a separate ``kg_invalidate``). ``source_file`` and ``source_drawer_id``
-    are RFC 002 §5.5 provenance fields populated by adapters / bulk importers.
+    are RFC 002 provenance fields populated by adapters / bulk importers.
 
     TODO(#1283): once the ISO-8601 validation PR lands, wire ``validate_iso_date``
     over ``valid_from`` / ``valid_to`` here so malformed dates fail fast at the
@@ -1509,7 +1509,7 @@ def tool_reconnect():
                 },
                 "source_drawer_id": {
                     "type": "string",
-                    "description": "Drawer ID the fact was extracted from (optional, RFC 002 §5.5 provenance)",
+                    "description": "Drawer ID the fact was extracted from (optional, RFC 002 provenance)",
                 },
             },
             "required": ["subject", "predicate", "object"],

From a91b7ee5c2ba439321bf3c835b698bf19c6d8b63 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sun, 3 May 2026 06:27:37 -0300
Subject: [PATCH 036/127] test(cli): prime monkeypatch undo so palace env
 doesn't leak

monkeypatch.delenv(name, raising=False) on a missing key registers no
undo entry, so the env var cmd_init writes leaked into test_config_from_file
on Python 3.13 / Windows / macOS.

Prime the slot with setenv before delenv so teardown rolls back the write.
---
 tests/test_cli.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index c52e67f..04442d0 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -187,9 +187,15 @@ def test_cmd_init_honors_palace_flag(tmp_path, monkeypatch):
     palace = tmp_path / "custom_palace"
 
     # Make sure no leftover env var from another test leaks in — we want to
-    # verify that --palace ALONE drives the resolution.
-    monkeypatch.delenv("MEMPALACE_PALACE_PATH", raising=False)
-    monkeypatch.delenv("MEMPAL_PALACE_PATH", raising=False)
+    # verify that --palace ALONE drives the resolution. Prime monkeypatch's
+    # undo list with setenv first so that the env var ``cmd_init`` writes
+    # below is rolled back at teardown (``delenv(raising=False)`` on a
+    # missing key registers no undo entry, which would leak into the next
+    # test).
+    monkeypatch.setenv("MEMPALACE_PALACE_PATH", "")
+    monkeypatch.setenv("MEMPAL_PALACE_PATH", "")
+    monkeypatch.delenv("MEMPALACE_PALACE_PATH")
+    monkeypatch.delenv("MEMPAL_PALACE_PATH")
 
     args = argparse.Namespace(
         dir=str(project),

From beac5d99547121ca04ff069dbeaac7619fc741a8 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 12:46:31 +0500
Subject: [PATCH 037/127] refactor(mcp): replace eager _kg with lazy per-path
 cache (#1136)

Swap the module-level KnowledgeGraph singleton for a lazy, per-path
cache keyed by the resolved sqlite path. Import no longer creates a
sqlite file as a side effect, and MCP servers started with --palace
now route KG calls to the correct tenant when MEMPALACE_PALACE_PATH
changes between calls, matching the per-call behavior of _get_client()
on the ChromaDB side.

Default-path behavior is preserved: without --palace at startup, KG
stays on DEFAULT_KG_PATH regardless of env var. The "no --palace but
env var set" case is #540's scope and is not changed here.
---
 mempalace/mcp_server.py | 52 +++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index e3e89c6..eae048b 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -46,6 +46,7 @@
 import json  # noqa: E402
 import logging  # noqa: E402
 import hashlib  # noqa: E402
+import threading  # noqa: E402
 import time  # noqa: E402
 from datetime import date, datetime  # noqa: E402
 from pathlib import Path  # noqa: E402
@@ -78,7 +79,7 @@
     follow_tunnels,
 )
 
-from .knowledge_graph import KnowledgeGraph  # noqa: E402
+from .knowledge_graph import KnowledgeGraph, DEFAULT_KG_PATH  # noqa: E402
 
 logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stderr)
 logger = logging.getLogger("mempalace_mcp")
@@ -103,12 +104,39 @@ def _parse_args():
     os.environ["MEMPALACE_PALACE_PATH"] = os.path.abspath(_args.palace)
 
 _config = MempalaceConfig()
-# Only override KG path when --palace is explicitly provided; otherwise use
-# KnowledgeGraph's default (~/.mempalace/knowledge_graph.sqlite3).
-if _args.palace:
-    _kg = KnowledgeGraph(db_path=os.path.join(_config.palace_path, "knowledge_graph.sqlite3"))
-else:
-    _kg = KnowledgeGraph()
+
+# Lazy per-path KG cache. Import no longer creates the sqlite file as a side
+# effect (see issue #1136). The path is resolved on each tool call so that a
+# multi-tenant host rotating MEMPALACE_PALACE_PATH between calls routes each
+# call to the correct KG file, matching the per-call behavior of _get_client()
+# on the ChromaDB side.
+_kg_by_path: dict[str, KnowledgeGraph] = {}
+_kg_cache_lock = threading.Lock()
+
+# Whether --palace was given at startup. Controls default-path resolution:
+# with the flag, KG follows _config.palace_path per call; without it, KG stays
+# on DEFAULT_KG_PATH regardless of env var (issue #540's territory, out of
+# scope here).
+_palace_flag_given: bool = bool(_args.palace)
+
+
+def _resolve_kg_path() -> str:
+    if _palace_flag_given:
+        return os.path.join(_config.palace_path, "knowledge_graph.sqlite3")
+    return DEFAULT_KG_PATH
+
+
+def _get_kg() -> KnowledgeGraph:
+    path = os.path.abspath(_resolve_kg_path())
+    kg = _kg_by_path.get(path)
+    if kg is not None:
+        return kg
+    with _kg_cache_lock:
+        kg = _kg_by_path.get(path)
+        if kg is None:
+            kg = KnowledgeGraph(db_path=path)
+            _kg_by_path[path] = kg
+    return kg
 
 
 _client_cache = None
@@ -1063,7 +1091,7 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
         return {"error": str(e)}
     if direction not in ("outgoing", "incoming", "both"):
         return {"error": "direction must be 'outgoing', 'incoming', or 'both'"}
-    results = _kg.query_entity(entity, as_of=as_of, direction=direction)
+    results = _get_kg().query_entity(entity, as_of=as_of, direction=direction)
     return {"entity": entity, "as_of": as_of, "facts": results, "count": len(results)}
 
 
@@ -1108,7 +1136,7 @@ def tool_kg_add(
             "source_drawer_id": source_drawer_id,
         },
     )
-    triple_id = _kg.add_triple(
+    triple_id = _get_kg().add_triple(
         subject,
         predicate,
         object,
@@ -1147,7 +1175,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
             "ended": resolved_ended,
         },
     )
-    _kg.invalidate(subject, predicate, object, ended=resolved_ended)
+    _get_kg().invalidate(subject, predicate, object, ended=resolved_ended)
     return {
         "success": True,
         "fact": f"{subject} → {predicate} → {object}",
@@ -1162,13 +1190,13 @@ def tool_kg_timeline(entity: str = None):
             entity = sanitize_kg_value(entity, "entity")
         except ValueError as e:
             return {"error": str(e)}
-    results = _kg.timeline(entity)
+    results = _get_kg().timeline(entity)
     return {"entity": entity or "all", "timeline": results, "count": len(results)}
 
 
 def tool_kg_stats():
     """Knowledge graph overview: entities, triples, relationship types."""
-    return _kg.stats()
+    return _get_kg().stats()
 
 
 # ==================== AGENT DIARY ====================

From 9e730098e97c173476b9949a5eca2253e56a08a5 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 12:48:00 +0500
Subject: [PATCH 038/127] test(mcp): migrate _kg monkeypatches to _get_kg
 (#1136)

Direct module-attribute patching of _kg is obsolete after the lazy
cache refactor. Switch test helpers to patch _get_kg instead so the
fixture KG replaces the factory rather than a now-missing singleton.

- tests/test_mcp_server.py: _patch_mcp_server helper
- tests/benchmarks/test_mcp_bench.py: _patch_mcp_config helper
- tests/benchmarks/test_memory_profile.py: inline patch in test_tool_status_repeated_calls
---
 tests/benchmarks/test_mcp_bench.py      | 3 ++-
 tests/benchmarks/test_memory_profile.py | 3 ++-
 tests/test_mcp_server.py                | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/benchmarks/test_mcp_bench.py b/tests/benchmarks/test_mcp_bench.py
index 4e8330b..42e73ec 100644
--- a/tests/benchmarks/test_mcp_bench.py
+++ b/tests/benchmarks/test_mcp_bench.py
@@ -40,8 +40,9 @@ def _patch_mcp_config(monkeypatch, palace_path, tmp_path):
 
     import mempalace.mcp_server as mcp_mod
 
+    kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
     monkeypatch.setattr(mcp_mod, "_config", cfg)
-    monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+    monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)
 
 
 def _get_rss_mb():
diff --git a/tests/benchmarks/test_memory_profile.py b/tests/benchmarks/test_memory_profile.py
index b299b2d..047bfaa 100644
--- a/tests/benchmarks/test_memory_profile.py
+++ b/tests/benchmarks/test_memory_profile.py
@@ -84,8 +84,9 @@ def test_tool_status_repeated_calls(self, tmp_path, monkeypatch):
 
         cfg = MempalaceConfig(config_dir=str(tmp_path / "cfg"))
         monkeypatch.setattr(cfg, "_file_config", {"palace_path": palace_path})
+        kg = KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3"))
         monkeypatch.setattr(mcp_mod, "_config", cfg)
-        monkeypatch.setattr(mcp_mod, "_kg", KnowledgeGraph(db_path=str(tmp_path / "kg.sqlite3")))
+        monkeypatch.setattr(mcp_mod, "_get_kg", lambda: kg)
 
         from mempalace.mcp_server import tool_status
 
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index ec9562b..2ab2eb8 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -18,7 +18,7 @@ def _patch_mcp_server(monkeypatch, config, kg):
     from mempalace import mcp_server
 
     monkeypatch.setattr(mcp_server, "_config", config)
-    monkeypatch.setattr(mcp_server, "_kg", kg)
+    monkeypatch.setattr(mcp_server, "_get_kg", lambda: kg)
 
 
 def _get_collection(palace_path, create=False):

From c69a622a18db0a082bce8729d35c1d6a8d98efdf Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 12:53:19 +0500
Subject: [PATCH 039/127] test(mcp): add multi-tenant and lazy-init tests for
 KG (#1136)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TestKGLazyCache covers the scenarios behind the lazy per-path refactor:

- test_lazy_init_no_import_side_effect: a fresh subprocess import does
  not create ~/.mempalace/knowledge_graph.sqlite3 (what closed PR #167
  was aiming at).
- test_get_kg_returns_same_instance: two _get_kg() calls under the same
  resolved path return the same object, cache has one entry.
- test_get_kg_different_paths_different_instances: rotating env var
  produces distinct KGs.
- test_multi_tenant_env_switch: the exact scenario from #1136 — write
  under path A, query under path B returns empty, switching back to A
  sees the fact.
- test_cache_thread_safe: 16 threads racing _get_kg() end up with one
  shared instance and one cache entry.
---
 tests/test_mcp_server.py | 112 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 2ab2eb8..86d5878 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -8,6 +8,7 @@
 
 from datetime import datetime
 import json
+import os
 import sys
 
 import pytest
@@ -1143,3 +1144,114 @@ def _spy_create(self, name, **kwargs):
         for kwargs in captured["get"]:
             assert "embedding_function" in kwargs
             assert kwargs["embedding_function"] is not None
+
+
+class TestKGLazyCache:
+    """Lazy per-path KnowledgeGraph cache (issue #1136)."""
+
+    def test_lazy_init_no_import_side_effect(self, tmp_path):
+        """Importing mcp_server must not create knowledge_graph.sqlite3.
+
+        Runs in a fresh subprocess with HOME pointed at tmp_path so the
+        assertion targets a clean filesystem, independent of conftest's
+        session-level HOME patch.
+        """
+        import subprocess
+        import sys
+
+        kg_file = tmp_path / ".mempalace" / "knowledge_graph.sqlite3"
+        result = subprocess.run(
+            [sys.executable, "-c", "import mempalace.mcp_server"],
+            env={
+                "HOME": str(tmp_path),
+                "USERPROFILE": str(tmp_path),
+                "PATH": os.environ.get("PATH", ""),
+                "PYTHONPATH": os.environ.get("PYTHONPATH", ""),
+            },
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        assert result.returncode == 0, f"import failed: {result.stderr}"
+        assert not kg_file.exists(), f"import created sqlite file at {kg_file} as a side effect"
+
+    def test_get_kg_returns_same_instance(self, tmp_path, monkeypatch):
+        """Two calls with the same resolved path return the same KG."""
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        kg1 = mcp_server._get_kg()
+        kg2 = mcp_server._get_kg()
+        assert kg1 is kg2
+        assert len(mcp_server._kg_by_path) == 1
+
+    def test_get_kg_different_paths_different_instances(self, tmp_path, monkeypatch):
+        """Different palace paths map to different KG instances."""
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "a"
+        tmp_b = tmp_path / "b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        kg_a = mcp_server._get_kg()
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        kg_b = mcp_server._get_kg()
+
+        assert kg_a is not kg_b
+        assert len(mcp_server._kg_by_path) == 2
+
+    def test_multi_tenant_env_switch(self, tmp_path, monkeypatch):
+        """The issue #1136 acceptance scenario.
+
+        Rotating MEMPALACE_PALACE_PATH between MCP tool calls must route
+        each call to the correct tenant's KG sqlite file.
+        """
+        from mempalace import mcp_server
+
+        tmp_a = tmp_path / "tenant_a"
+        tmp_b = tmp_path / "tenant_b"
+        tmp_a.mkdir()
+        tmp_b.mkdir()
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        add_result = mcp_server.tool_kg_add(
+            subject="alice_secret",
+            predicate="owns",
+            object="repo_a",
+        )
+        assert add_result.get("success") is True, add_result
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_b))
+        query_b = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_b.get("count", 0) == 0, f"tenant B leaked tenant A's fact: {query_b}"
+
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_a))
+        query_a = mcp_server.tool_kg_query(entity="alice_secret")
+        assert query_a.get("count", 0) >= 1, f"tenant A lost its own fact: {query_a}"
+
+    def test_cache_thread_safe(self, tmp_path, monkeypatch):
+        """Concurrent _get_kg() for the same path yields one instance."""
+        import concurrent.futures
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {})
+        monkeypatch.setattr(mcp_server, "_palace_flag_given", True)
+        monkeypatch.setenv("MEMPALACE_PALACE_PATH", str(tmp_path))
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as pool:
+            results = list(pool.map(lambda _: mcp_server._get_kg(), range(16)))
+
+        ids = {id(kg) for kg in results}
+        assert len(ids) == 1, f"expected 1 unique instance, got {len(ids)}"
+        assert len(mcp_server._kg_by_path) == 1

From 84f9726a39e65be3f8afba0d14d4183999520c0d Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 13:03:12 +0500
Subject: [PATCH 040/127] test(mcp): fix Windows subprocess env in KG lazy-init
 test

Passing a stripped env dict without SYSTEMROOT/WINDIR breaks Python
bootstrap on Windows (_Py_HashRandomization_Init). Inherit the parent
env and strip MEMPAL* vars instead, then override HOME/USERPROFILE to
the tmp dir.
---
 tests/test_mcp_server.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 86d5878..638ac15 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1160,14 +1160,12 @@ def test_lazy_init_no_import_side_effect(self, tmp_path):
         import sys
 
         kg_file = tmp_path / ".mempalace" / "knowledge_graph.sqlite3"
+        env = {k: v for k, v in os.environ.items() if not k.startswith("MEMPAL")}
+        env["HOME"] = str(tmp_path)
+        env["USERPROFILE"] = str(tmp_path)
         result = subprocess.run(
             [sys.executable, "-c", "import mempalace.mcp_server"],
-            env={
-                "HOME": str(tmp_path),
-                "USERPROFILE": str(tmp_path),
-                "PATH": os.environ.get("PATH", ""),
-                "PYTHONPATH": os.environ.get("PYTHONPATH", ""),
-            },
+            env=env,
             capture_output=True,
             text=True,
             timeout=30,

From 19f8a4ff682fe5fdad71c6c5d31a45fac6ce2310 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Fri, 24 Apr 2026 13:07:34 +0500
Subject: [PATCH 041/127] style(mcp): drop issue-tracker comments from KG cache
 block

Inline comments referencing #1136 and #540 add no information the
identifiers do not already convey. PR description carries the context;
code stays quiet.
---
 mempalace/mcp_server.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index eae048b..5f74df6 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -105,18 +105,8 @@ def _parse_args():
 
 _config = MempalaceConfig()
 
-# Lazy per-path KG cache. Import no longer creates the sqlite file as a side
-# effect (see issue #1136). The path is resolved on each tool call so that a
-# multi-tenant host rotating MEMPALACE_PALACE_PATH between calls routes each
-# call to the correct KG file, matching the per-call behavior of _get_client()
-# on the ChromaDB side.
 _kg_by_path: dict[str, KnowledgeGraph] = {}
 _kg_cache_lock = threading.Lock()
-
-# Whether --palace was given at startup. Controls default-path resolution:
-# with the flag, KG follows _config.palace_path per call; without it, KG stays
-# on DEFAULT_KG_PATH regardless of env var (issue #540's territory, out of
-# scope here).
 _palace_flag_given: bool = bool(_args.palace)
 
 
From 0a626580513c09bd3a9f7719e53dc3e4810463f5 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sat, 2 May 2026 18:00:36 +0500
Subject: [PATCH 042/127] fix(mcp): drain KG cache on tool_reconnect

tool_reconnect cleared ChromaDB caches but left _kg_by_path entries
intact. After an external replacement of knowledge_graph.sqlite3 the
server kept serving the old open sqlite3.Connection, returning stale
results.

Now iterate _kg_by_path under _kg_cache_lock, call close() best-effort,
and clear the dict so the next tool call reopens the KG from disk.
Two new tests in TestKGLazyCache verify cache invalidation and that a
failing close() does not block the clear.
---
 mempalace/mcp_server.py  | 14 ++++++++++++--
 tests/test_mcp_server.py | 42 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 5f74df6..3b3b4e2 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1418,10 +1418,11 @@ def tool_memories_filed_away():
 
 
 def tool_reconnect():
-    """Force the MCP server to drop the cached ChromaDB collection and reconnect.
+    """Force the MCP server to drop cached ChromaDB + KnowledgeGraph state.
 
     Use after external scripts or CLI commands modify the palace database
-    directly, which can leave the in-memory HNSW index stale.
+    or replace ``knowledge_graph.sqlite3`` directly, which can leave the
+    in-memory HNSW index stale or pin a closed-on-disk SQLite connection.
     """
     global \
         _client_cache, \
@@ -1439,6 +1440,15 @@ def tool_reconnect():
     # still applies after the reconnect.
     _vector_disabled = False
     _vector_disabled_reason = ""
+    # Drain the per-path KnowledgeGraph cache so a replaced sqlite file is
+    # reopened on the next tool call rather than served from a stale handle.
+    with _kg_cache_lock:
+        for kg in _kg_by_path.values():
+            try:
+                kg.close()
+            except Exception:
+                pass
+        _kg_by_path.clear()
     try:
         col = _get_collection()
         if col is None:
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 638ac15..092b707 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1253,3 +1253,45 @@ def test_cache_thread_safe(self, tmp_path, monkeypatch):
         ids = {id(kg) for kg in results}
         assert len(ids) == 1, f"expected 1 unique instance, got {len(ids)}"
         assert len(mcp_server._kg_by_path) == 1
+
+    def test_tool_reconnect_drains_kg_cache(self, monkeypatch):
+        """``tool_reconnect`` must close cached KG instances and clear the dict.
+
+        Without this, an external replacement of ``knowledge_graph.sqlite3``
+        leaves the server pinned to a stale ``sqlite3.Connection``.
+        """
+        from mempalace import mcp_server
+
+        class _FakeKG:
+            def __init__(self):
+                self.closed = False
+
+            def close(self):
+                self.closed = True
+
+        fake_a = _FakeKG()
+        fake_b = _FakeKG()
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": fake_a, "/b": fake_b})
+        # Bypass real ChromaDB so the test isolates KG-cache behaviour.
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert fake_a.closed is True
+        assert fake_b.closed is True
+        assert mcp_server._kg_by_path == {}
+
+    def test_tool_reconnect_swallows_kg_close_errors(self, monkeypatch):
+        """A failing ``close()`` on one cached KG must not block cache clearing."""
+        from mempalace import mcp_server
+
+        class _BoomKG:
+            def close(self):
+                raise RuntimeError("boom")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {"/a": _BoomKG()})
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda: None)
+
+        mcp_server.tool_reconnect()
+
+        assert mcp_server._kg_by_path == {}

From 45df1a265748200f709a1d737bfb8003061cbb32 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Wed, 22 Apr 2026 15:59:43 +0500
Subject: [PATCH 043/127] fix(backends/chroma): release SQLite file lock on
 close_palace/close (#1067)

ChromaBackend.close_palace() and close() evicted cached PersistentClients
from self._clients without calling client.close(), so chromadb 1.5.x kept
the rust-side SQLite file lock until GC. Reopening the same palace path
after shutil.rmtree + re-create within one process then failed with
SQLITE_READONLY_DBMOVED (SQLite code 1032).

Add _close_client() helper with a try/except fallback for older chromadb,
and route close_palace(), close(), and the DB-file-missing invalidation
branch of _client() through it. The mtime/inode auto-invalidation branch
is left as-is: callers there may still hold a live ChromaCollection
handle, and closing out from under them clears the rust bindings mid-use.

Regression tests cover close_palace reopen-same-path and whole-backend
close for multiple palaces.
---
 mempalace/backends/chroma.py | 28 ++++++++++++++++++---
 tests/test_backends.py       | 47 ++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index d9b99a4..e7c2e6f 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -676,6 +676,20 @@ def _as_list(v: Any) -> list:
     return [v]
 
 
+def _close_client(client) -> None:
+    """Call ``PersistentClient.close()`` if available, swallow otherwise.
+
+    chromadb 1.5.x exposes ``Client.close()`` to release rust-side SQLite
+    file locks; older versions relied on GC. Try/except keeps forward-compat.
+    """
+    if client is None:
+        return
+    try:
+        client.close()
+    except Exception:
+        logger.debug("client.close() unavailable or failed", exc_info=True)
+
+
 class ChromaCollection(BaseCollection):
     """Thin adapter translating ChromaDB dict returns into typed results."""
 
@@ -977,7 +991,7 @@ def _client(self, palace_path: str):
         db_path = os.path.join(palace_path, "chroma.sqlite3")
         # DB was present when cache was built but is now missing → invalidate.
         if cached is not None and not os.path.isfile(db_path):
-            self._clients.pop(palace_path, None)
+            _close_client(self._clients.pop(palace_path, None))
             self._freshness.pop(palace_path, None)
             cached = None
             cached_inode, cached_mtime = 0, 0.0
@@ -1134,14 +1148,22 @@ def get_collection(
         return ChromaCollection(collection)
 
     def close_palace(self, palace) -> None:
-        """Drop cached handles for ``palace``. Accepts ``PalaceRef`` or legacy path str."""
+        """Drop cached handles for ``palace`` and release its SQLite file lock.
+
+        Accepts ``PalaceRef`` or legacy path str. chromadb's rust-side file
+        lock is held until ``PersistentClient.close()`` is called, so plain
+        dict eviction would leave the palace path unreopenable and
+        unremovable in the same process.
+        """
         path = palace.local_path if isinstance(palace, PalaceRef) else palace
         if path is None:
             return
-        self._clients.pop(path, None)
+        _close_client(self._clients.pop(path, None))
         self._freshness.pop(path, None)
 
     def close(self) -> None:
+        for client in self._clients.values():
+            _close_client(client)
         self._clients.clear()
         self._freshness.clear()
         self._closed = True
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 4ddfe12..06998c5 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 import sqlite3
 from pathlib import Path
 
@@ -206,6 +207,52 @@ def test_query_empty_preserves_embeddings_outer_shape_when_requested():
     assert not_requested.embeddings is None
 
 
+def test_chroma_close_palace_releases_sqlite_lock_for_reopen(tmp_path):
+    """close_palace must release chromadb's rust-side SQLite file lock so
+    a fresh PersistentClient on the same path after shutil.rmtree can
+    write without hitting SQLITE_READONLY_DBMOVED."""
+    backend = ChromaBackend()
+    palace_path = tmp_path / "palace-a"
+    ref = PalaceRef(id=str(palace_path), local_path=str(palace_path))
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["hello"], ids=["a"], metadatas=[{"k": "v"}])
+
+    backend.close_palace(ref)
+    shutil.rmtree(palace_path)
+
+    col = backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+    col.upsert(documents=["world"], ids=["b"], metadatas=[{"k": "v2"}])
+    assert col.count() == 1
+
+
+def test_chroma_close_releases_all_cached_clients(tmp_path):
+    """close() must release every cached client's SQLite file lock so any
+    of their palace paths can be reopened by a fresh backend in the same
+    process."""
+    backend = ChromaBackend()
+    palace_a = tmp_path / "palace-a"
+    palace_b = tmp_path / "palace-b"
+    ref_a = PalaceRef(id=str(palace_a), local_path=str(palace_a))
+    ref_b = PalaceRef(id=str(palace_b), local_path=str(palace_b))
+
+    for ref in (ref_a, ref_b):
+        backend.get_collection(palace=ref, collection_name="mempalace_drawers", create=True).upsert(
+            documents=["x"], ids=["x"], metadatas=[{"k": "v"}]
+        )
+
+    backend.close()
+
+    for path in (palace_a, palace_b):
+        shutil.rmtree(path)
+        ref = PalaceRef(id=str(path), local_path=str(path))
+        fresh = ChromaBackend()
+        col = fresh.get_collection(palace=ref, collection_name="mempalace_drawers", create=True)
+        col.upsert(documents=["y"], ids=["y"], metadatas=[{"k": "v2"}])
+        assert col.count() == 1
+        fresh.close()
+
+
 def test_chroma_cache_invalidates_when_db_file_missing(tmp_path):
     """A palace rebuild that removes chroma.sqlite3 must drop the stale cache.
 

From 7cee74c8c8c6c31acb8b363788443612e36dac77 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Thu, 30 Apr 2026 14:49:02 +0500
Subject: [PATCH 044/127] fix(fact-checker): reconfigure stdio to UTF-8 on
 Windows

The `python -m mempalace.fact_checker --stdin` entry point reads non-ASCII
text through the system ANSI codepage (cp1252/cp1251/cp950) on Windows,
which mojibakes characters before claim-extraction sees them. Reconfigure
stdin/stdout/stderr to UTF-8 with `errors="strict"`, wrapped in try/except
so a replaced stream (Jupyter, test harness) logs a warning rather than
crashing the CLI.

Mirrors the same fix shipped for `mcp_server.py:main()` (#400) and
`hooks_cli.py:run_hook()` (#1280) -- this is the third and last
stdin-reading entry point in the package.
---
 mempalace/fact_checker.py  | 27 +++++++++++++++++
 tests/test_fact_checker.py | 60 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index 50e8842..c894859 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -303,11 +303,38 @@ def _edit_distance(s1: str, s2: str) -> int:
     return prev[-1]
 
 
+def _reconfigure_stdio_utf8_on_windows():
+    """Decode --stdin payload as UTF-8 on Windows.
+
+    Without this, Python defaults stdio to the system ANSI codepage
+    (cp1252/cp1251/cp950 depending on locale), which mojibakes
+    non-ASCII fact text before pattern parsing sees it.
+    """
+    import sys
+
+    if sys.platform != "win32":
+        return
+    for name in ("stdin", "stdout", "stderr"):
+        stream = getattr(sys, name, None)
+        reconfigure = getattr(stream, "reconfigure", None)
+        if reconfigure is None:
+            continue
+        try:
+            reconfigure(encoding="utf-8", errors="strict")
+        except Exception as exc:
+            print(
+                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+                file=sys.stderr,
+            )
+
+
 if __name__ == "__main__":
     import argparse
     import json
     import sys
 
+    _reconfigure_stdio_utf8_on_windows()
+
     parser = argparse.ArgumentParser(
         description="Check text against known facts in the MemPalace palace.",
         epilog="Exits 0 when no issues found, 1 when one or more issues detected.",
diff --git a/tests/test_fact_checker.py b/tests/test_fact_checker.py
index 5b34a40..9db370e 100644
--- a/tests/test_fact_checker.py
+++ b/tests/test_fact_checker.py
@@ -286,3 +286,63 @@ def test_exits_nonzero_when_issues_found(self, tmp_path, monkeypatch, capsys):
         assert "similar_name" in out
         # Silence unused import warning.
         _ = (MagicMock, patch, fact_checker)
+
+    def test_reconfigures_stdio_to_utf8_on_windows(self):
+        """Windows fact_checker --stdin must decode payload as UTF-8.
+
+        Without this, Python defaults stdio to the system ANSI codepage
+        (cp1252/cp1251/cp950), which mojibakes non-ASCII text before
+        pattern parsing sees it.
+        """
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self, initial_value=""):
+                super().__init__(initial_value)
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        stdout = _ReconfigurableStringIO()
+        stderr = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "win32"),
+            patch.object(sys, "stdin", stdin),
+            patch.object(sys, "stdout", stdout),
+            patch.object(sys, "stderr", stderr),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        expected = {"encoding": "utf-8", "errors": "strict"}
+        assert stdin.reconfigure_calls == [expected]
+        assert stdout.reconfigure_calls == [expected]
+        assert stderr.reconfigure_calls == [expected]
+
+    def test_reconfigure_stdio_is_noop_off_windows(self):
+        """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+        import io
+        import sys
+
+        from mempalace.fact_checker import _reconfigure_stdio_utf8_on_windows
+
+        class _ReconfigurableStringIO(io.StringIO):
+            def __init__(self):
+                super().__init__()
+                self.reconfigure_calls = []
+
+            def reconfigure(self, **kwargs):
+                self.reconfigure_calls.append(kwargs)
+
+        stdin = _ReconfigurableStringIO()
+        with (
+            patch.object(sys, "platform", "linux"),
+            patch.object(sys, "stdin", stdin),
+        ):
+            _reconfigure_stdio_utf8_on_windows()
+
+        assert stdin.reconfigure_calls == []

From 32f4dfa26d25b8ff243bfd2e636f5e96d8947a83 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Thu, 30 Apr 2026 15:00:37 +0500
Subject: [PATCH 045/127] fix(cli): reconfigure stdio to UTF-8 on Windows

The primary `mempalace` console_script (`cli.py:main()`) reads non-ASCII
arguments via piped stdin and writes verbatim drawer text / wing names
through `print()`. On Windows, Python defaults stdio to the system ANSI
codepage (cp1252/cp1251/cp950), so:

- `mempalace search "..." > out.txt` mojibakes any drawer text containing
  non-Latin characters
- `mempalace ... < input.txt` mojibakes piped non-ASCII input

Reconfigure stdin/stdout/stderr to UTF-8 (`errors="strict"`) at the top
of `main()`, mirroring the helper added in this PR for fact_checker's
`__main__` block. Wrapped in try/except so a replaced stream (Jupyter,
test harness) logs a warning and continues rather than crashing the CLI.

The reconfigure cascades through every `mempalace` subcommand
(`init`/`mine`/`search`/`status`/`hook`/etc.) and through the interactive
flows that read non-ASCII names via `input()` (onboarding, entity
detector, room detector). With this commit the package's three
user-facing entry points (`mempalace`, `mempalace-mcp`, and
`python -m mempalace.fact_checker`) all reconfigure stdio identically on
Windows.
---
 mempalace/cli.py  | 27 ++++++++++++++++++++++++
 tests/test_cli.py | 52 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index f2606a4..7372cd7 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -935,7 +935,34 @@ def cmd_compress(args):
         print("  (dry run -- nothing stored)")
 
 
+def _reconfigure_stdio_utf8_on_windows():
+    """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.
+
+    Without this, Python defaults stdio to the system ANSI codepage
+    (cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII
+    content piped in (`mempalace search ... < query.txt`) or piped out
+    (`mempalace search "..." > out.txt`) when verbatim drawer text or
+    wing/room names contain non-Latin characters.
+    """
+    if sys.platform != "win32":
+        return
+    for name in ("stdin", "stdout", "stderr"):
+        stream = getattr(sys, name, None)
+        reconfigure = getattr(stream, "reconfigure", None)
+        if reconfigure is None:
+            continue
+        try:
+            reconfigure(encoding="utf-8", errors="strict")
+        except Exception as exc:
+            print(
+                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+                file=sys.stderr,
+            )
+
+
 def main():
+    _reconfigure_stdio_utf8_on_windows()
+
     version_label = f"MemPalace {__version__}"
     parser = argparse.ArgumentParser(
         description="MemPalace — Give your AI a memory. No API key required.",
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 328b90c..4836d69 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1042,3 +1042,55 @@ def test_cmd_repair_trailing_slash_does_not_recurse():
     palace_path = os.path.expanduser(args.palace).rstrip(os.sep)
     backup_path = palace_path + ".backup"
     assert not backup_path.startswith(palace_path + os.sep)
+
+
+# ── stdio reconfigure on Windows ─────────────────────────────────────
+
+
+class _ReconfigurableStringIO:
+    def __init__(self):
+        self.reconfigure_calls = []
+
+    def reconfigure(self, **kwargs):
+        self.reconfigure_calls.append(kwargs)
+
+
+def test_reconfigures_stdio_to_utf8_on_windows():
+    """Windows `mempalace` CLI must decode/encode stdio as UTF-8.
+
+    Without this, piped non-ASCII input (`mempalace search ... < q.txt`)
+    or piped non-ASCII output (`mempalace search "..." > out.txt`) is
+    mojibaked through the system ANSI codepage on non-Latin Windows
+    locales (cp1252/cp1251/cp950).
+    """
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    stdout = _ReconfigurableStringIO()
+    stderr = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "win32"),
+        patch.object(sys, "stdin", stdin),
+        patch.object(sys, "stdout", stdout),
+        patch.object(sys, "stderr", stderr),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    expected = {"encoding": "utf-8", "errors": "strict"}
+    assert stdin.reconfigure_calls == [expected]
+    assert stdout.reconfigure_calls == [expected]
+    assert stderr.reconfigure_calls == [expected]
+
+
+def test_reconfigure_stdio_is_noop_off_windows():
+    """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""
+    from mempalace.cli import _reconfigure_stdio_utf8_on_windows
+
+    stdin = _ReconfigurableStringIO()
+    with (
+        patch.object(sys, "platform", "linux"),
+        patch.object(sys, "stdin", stdin),
+    ):
+        _reconfigure_stdio_utf8_on_windows()
+
+    assert stdin.reconfigure_calls == []

From 03643eb507e4ba81c65d50b519fcfb4dfb3c769f Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 21:37:12 +0500
Subject: [PATCH 046/127] fix(cli, fact-checker): per-stream stdio errors
 policy on Windows

Previously all three streams reconfigured to UTF-8 with errors='strict'.
That kills 'mempalace search' the moment a drawer carrying a surrogate
half (round-tripped from a filename via surrogateescape) hits print(),
losing the rest of the result block. Same hazard for warning lines on
stderr.

Split the policy:
  stdin  -> surrogateescape (malformed bytes from a redirected file
            survive as lone surrogates instead of crashing the read)
  stdout -> replace (drawer text with a stray surrogate becomes U+FFFD
            instead of UnicodeEncodeError mid-print)
  stderr -> replace (same protection for logger / warning paths)

Applied identically in the cli.py and fact_checker.py helpers; the DRY
extraction into a shared module is a separate cleanup ask, kept out of
this fix to keep the diff narrow.

Tests updated for the new per-stream assertion.
---
 mempalace/cli.py           | 20 ++++++++++++++++++--
 mempalace/fact_checker.py  | 18 ++++++++++++++++--
 tests/test_cli.py          | 11 +++++++----
 tests/test_fact_checker.py | 11 +++++++----
 4 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 7372cd7..7052e1f 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -943,16 +943,32 @@ def _reconfigure_stdio_utf8_on_windows():
     content piped in (`mempalace search ... < query.txt`) or piped out
     (`mempalace search "..." > out.txt`) when verbatim drawer text or
     wing/room names contain non-Latin characters.
+
+    Per-stream errors policy:
+      stdin  -- surrogateescape: malformed bytes from a redirected file
+                survive as lone surrogates instead of crashing the read.
+      stdout -- replace: ``mempalace search`` prints verbatim drawer
+                text. A drawer that round-tripped a filename through
+                surrogateescape can hold a lone surrogate, which would
+                otherwise raise ``UnicodeEncodeError`` mid-print and
+                lose the rest of the search result block.
+      stderr -- replace: same hazard for logger output that quotes
+                user-supplied path or content.
     """
     if sys.platform != "win32":
         return
-    for name in ("stdin", "stdout", "stderr"):
+    policies = (
+        ("stdin", "surrogateescape"),
+        ("stdout", "replace"),
+        ("stderr", "replace"),
+    )
+    for name, errors in policies:
         stream = getattr(sys, name, None)
         reconfigure = getattr(stream, "reconfigure", None)
         if reconfigure is None:
             continue
         try:
-            reconfigure(encoding="utf-8", errors="strict")
+            reconfigure(encoding="utf-8", errors=errors)
         except Exception as exc:
             print(
                 f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index c894859..1844c45 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -309,18 +309,32 @@ def _reconfigure_stdio_utf8_on_windows():
     Without this, Python defaults stdio to the system ANSI codepage
     (cp1252/cp1251/cp950 depending on locale), which mojibakes
     non-ASCII fact text before pattern parsing sees it.
+
+    Per-stream errors policy mirrors the primary CLI helper in
+    ``mempalace/cli.py``:
+      stdin  -- surrogateescape: malformed input bytes survive as lone
+                surrogates instead of crashing the read.
+      stdout -- replace: extracted fact text can include surrogate
+                halves round-tripped from filenames; replace prevents
+                a UnicodeEncodeError mid-print.
+      stderr -- replace: same protection for warning lines.
     """
     import sys
 
     if sys.platform != "win32":
         return
-    for name in ("stdin", "stdout", "stderr"):
+    policies = (
+        ("stdin", "surrogateescape"),
+        ("stdout", "replace"),
+        ("stderr", "replace"),
+    )
+    for name, errors in policies:
         stream = getattr(sys, name, None)
         reconfigure = getattr(stream, "reconfigure", None)
         if reconfigure is None:
             continue
         try:
-            reconfigure(encoding="utf-8", errors="strict")
+            reconfigure(encoding="utf-8", errors=errors)
         except Exception as exc:
             print(
                 f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 4836d69..6b4b7b3 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1076,10 +1076,13 @@ def test_reconfigures_stdio_to_utf8_on_windows():
     ):
         _reconfigure_stdio_utf8_on_windows()
 
-    expected = {"encoding": "utf-8", "errors": "strict"}
-    assert stdin.reconfigure_calls == [expected]
-    assert stdout.reconfigure_calls == [expected]
-    assert stderr.reconfigure_calls == [expected]
+    # Per-stream errors policy: stdin survives bad bytes via
+    # surrogateescape so a redirected non-UTF-8 file does not crash
+    # the read; stdout/stderr use replace so a drawer carrying a
+    # round-tripped surrogate half does not crash mid-print.
+    assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+    assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+    assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
 
 
 def test_reconfigure_stdio_is_noop_off_windows():
diff --git a/tests/test_fact_checker.py b/tests/test_fact_checker.py
index 9db370e..89d8366 100644
--- a/tests/test_fact_checker.py
+++ b/tests/test_fact_checker.py
@@ -318,10 +318,13 @@ def reconfigure(self, **kwargs):
         ):
             _reconfigure_stdio_utf8_on_windows()
 
-        expected = {"encoding": "utf-8", "errors": "strict"}
-        assert stdin.reconfigure_calls == [expected]
-        assert stdout.reconfigure_calls == [expected]
-        assert stderr.reconfigure_calls == [expected]
+        # Per-stream errors policy: stdin uses surrogateescape so a stray
+        # malformed byte from a redirected file does not crash the read,
+        # stdout/stderr use replace so an extracted fact carrying a
+        # surrogate half does not crash mid-print.
+        assert stdin.reconfigure_calls == [{"encoding": "utf-8", "errors": "surrogateescape"}]
+        assert stdout.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
+        assert stderr.reconfigure_calls == [{"encoding": "utf-8", "errors": "replace"}]
 
     def test_reconfigure_stdio_is_noop_off_windows(self):
         """Linux/macOS already default to UTF-8 stdio -- helper must not touch streams."""

From b8816e0fe2fa857efb984e0aff9e52739fb2af34 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 21:43:51 +0500
Subject: [PATCH 047/127] fix(mcp): retry KG handlers once on concurrent close
 race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Race scenario: a KG tool handler calls _get_kg() and gets a live
KnowledgeGraph; another thread fires tool_reconnect() between that
return and the handler's kg.add_triple()/kg.query_entity()/etc call.
tool_reconnect drains _kg_by_path and closes the underlying
sqlite3.Connection; the handler then raises sqlite3.ProgrammingError:
'Cannot operate on a closed database', which surfaces as a -32000
to the MCP client even though the user just asked for a reconnect.

New _call_kg(op) helper wraps each handler's kg call in a one-shot
retry: catch exactly sqlite3.ProgrammingError, evict the stale entry
(only if the cache slot still points at the closed instance — another
thread may have already replaced it), and rerun op against a fresh
_get_kg(). Beyond one retry give up so a sustained close-stream
surfaces clearly instead of looping.

All five KG handlers (tool_kg_query, tool_kg_add, tool_kg_invalidate,
tool_kg_timeline, tool_kg_stats) now route through _call_kg.

Tests pin the contract:
  * retries with a fresh KG and returns the second result
  * non-ProgrammingError exceptions propagate without retry
  * gives up after exactly one retry on sustained close
---
 mempalace/mcp_server.py  | 61 ++++++++++++++++++++++++-------
 tests/test_mcp_server.py | 77 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 13 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 3b3b4e2..c67619e 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -46,6 +46,7 @@
 import json  # noqa: E402
 import logging  # noqa: E402
 import hashlib  # noqa: E402
+import sqlite3  # noqa: E402
 import threading  # noqa: E402
 import time  # noqa: E402
 from datetime import date, datetime  # noqa: E402
@@ -129,6 +130,38 @@ def _get_kg() -> KnowledgeGraph:
     return kg
 
 
+def _call_kg(op):
+    """Run ``op(kg)`` against the cached KG with one-shot retry on close.
+
+    Race we're guarding against: a handler grabs ``kg = _get_kg()`` and is
+    about to call ``kg.add_triple(...)`` when ``tool_reconnect`` fires on
+    another thread, drains ``_kg_by_path``, and closes the underlying
+    sqlite3.Connection. The handler's call then raises
+    ``sqlite3.ProgrammingError: Cannot operate on a closed database`` and
+    bubbles up as a -32000 to the MCP client even though the user just
+    asked for a reconnect.
+
+    Catch that single class of error, evict the stale entry from the
+    cache (only if it still points at the closed instance — another
+    thread may have already replaced it), and try once more with a fresh
+    KG. Beyond one retry give up: a second close means we're losing a
+    sustained race we won't win in this loop, and a hung loop is worse
+    than a clear failure surface.
+    """
+    for attempt in range(2):
+        kg = _get_kg()
+        try:
+            return op(kg)
+        except sqlite3.ProgrammingError:
+            if attempt == 0:
+                path = os.path.abspath(_resolve_kg_path())
+                with _kg_cache_lock:
+                    if _kg_by_path.get(path) is kg:
+                        _kg_by_path.pop(path, None)
+                continue
+            raise
+
+
 _client_cache = None
 _collection_cache = None
 _palace_db_inode = 0  # inode of chroma.sqlite3 at cache time
@@ -1081,7 +1114,7 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
         return {"error": str(e)}
     if direction not in ("outgoing", "incoming", "both"):
         return {"error": "direction must be 'outgoing', 'incoming', or 'both'"}
-    results = _get_kg().query_entity(entity, as_of=as_of, direction=direction)
+    results = _call_kg(lambda kg: kg.query_entity(entity, as_of=as_of, direction=direction))
     return {"entity": entity, "as_of": as_of, "facts": results, "count": len(results)}
 
 
@@ -1126,15 +1159,17 @@ def tool_kg_add(
             "source_drawer_id": source_drawer_id,
         },
     )
-    triple_id = _get_kg().add_triple(
-        subject,
-        predicate,
-        object,
-        valid_from=valid_from,
-        valid_to=valid_to,
-        source_closet=source_closet,
-        source_file=source_file,
-        source_drawer_id=source_drawer_id,
+    triple_id = _call_kg(
+        lambda kg: kg.add_triple(
+            subject,
+            predicate,
+            object,
+            valid_from=valid_from,
+            valid_to=valid_to,
+            source_closet=source_closet,
+            source_file=source_file,
+            source_drawer_id=source_drawer_id,
+        )
     )
     return {"success": True, "triple_id": triple_id, "fact": f"{subject} → {predicate} → {object}"}
 
@@ -1165,7 +1200,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
             "ended": resolved_ended,
         },
     )
-    _get_kg().invalidate(subject, predicate, object, ended=resolved_ended)
+    _call_kg(lambda kg: kg.invalidate(subject, predicate, object, ended=resolved_ended))
     return {
         "success": True,
         "fact": f"{subject} → {predicate} → {object}",
@@ -1180,13 +1215,13 @@ def tool_kg_timeline(entity: str = None):
             entity = sanitize_kg_value(entity, "entity")
         except ValueError as e:
             return {"error": str(e)}
-    results = _get_kg().timeline(entity)
+    results = _call_kg(lambda kg: kg.timeline(entity))
     return {"entity": entity or "all", "timeline": results, "count": len(results)}
 
 
 def tool_kg_stats():
     """Knowledge graph overview: entities, triples, relationship types."""
-    return _get_kg().stats()
+    return _call_kg(lambda kg: kg.stats())
 
 
 # ==================== AGENT DIARY ====================
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 092b707..e365afc 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1295,3 +1295,80 @@ def close(self):
         mcp_server.tool_reconnect()
 
         assert mcp_server._kg_by_path == {}
+
+    def test_call_kg_retries_after_concurrent_close(self, monkeypatch):
+        """A KG closed mid-handler must trigger a one-shot retry with a fresh
+        instance — not surface a -32000 to the MCP client."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        class _ClosedKG:
+            def query_entity(self, entity, **kwargs):
+                raise _sqlite3.ProgrammingError("Cannot operate on a closed database")
+
+        class _FreshKG:
+            def query_entity(self, entity, **kwargs):
+                return [{"entity": entity}]
+
+        cache = {os.path.abspath(path): _ClosedKG()}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+
+        # Second _get_kg() call (after the cache eviction) constructs a new
+        # KG. Patch the constructor so we don't open a real sqlite file.
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FreshKG())
+
+        result = mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert result == [{"entity": "Alice"}]
+        # The closed instance must be evicted; the fresh one must be cached.
+        assert isinstance(cache[os.path.abspath(path)], _FreshKG)
+
+    def test_call_kg_does_not_retry_on_other_errors(self, monkeypatch):
+        """Non-ProgrammingError exceptions must propagate without retry —
+        we don't want the retry guard masking real bugs."""
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _FailingKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise ValueError("bad input")
+
+        monkeypatch.setattr(mcp_server, "_kg_by_path", {os.path.abspath(path): _FailingKG()})
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _FailingKG())
+
+        with pytest.raises(ValueError, match="bad input"):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 1, "non-ProgrammingError must not trigger retry"
+
+    def test_call_kg_gives_up_after_one_retry(self, monkeypatch):
+        """If the second attempt also hits a closed DB, give up rather than
+        loop forever — a sustained close-stream is a different bug."""
+        import sqlite3 as _sqlite3
+
+        from mempalace import mcp_server
+
+        path = "/fake/palace/knowledge_graph.sqlite3"
+        monkeypatch.setattr(mcp_server, "_resolve_kg_path", lambda: path)
+
+        calls = {"count": 0}
+
+        class _AlwaysClosedKG:
+            def query_entity(self, entity, **kwargs):
+                calls["count"] += 1
+                raise _sqlite3.ProgrammingError("closed again")
+
+        cache = {}
+        monkeypatch.setattr(mcp_server, "_kg_by_path", cache)
+        monkeypatch.setattr(mcp_server, "KnowledgeGraph", lambda **_: _AlwaysClosedKG())
+
+        with pytest.raises(_sqlite3.ProgrammingError):
+            mcp_server._call_kg(lambda kg: kg.query_entity("Alice"))
+        assert calls["count"] == 2, "expected exactly one retry beyond the initial attempt"

From 75ad8ae7819ee9eebdf635cc0c3e969b2f9bc73b Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 22:04:22 +0500
Subject: [PATCH 048/127] ci: retrigger linux 3.13 (transient onnx download
 flake)


From 285b3b4f2e387c1e8eda865569a2edc400f5c1f1 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sun, 3 May 2026 22:25:31 +0500
Subject: [PATCH 049/127] refactor(stdio): extract Windows UTF-8 reconfigure
 into shared helper

Both cli.py and fact_checker.py carried identical 28-line Windows stdio
reconfigure helpers; pull the loop into mempalace/_stdio.py so the same
machine drives the CLI, the fact_checker --stdin entry point, and the
MCP server. The thin per-call-site wrappers stay so existing tests keep
importing _reconfigure_stdio_utf8_on_windows from the same module they
always have.

CLI / fact_checker policy unchanged: stdin=surrogateescape (don't crash
on a malformed redirected file), stdout/stderr=replace (don't crash
mid-print on a surrogate half round-tripped from a filename).
---
 mempalace/_stdio.py       | 71 +++++++++++++++++++++++++++++++++++++++
 mempalace/cli.py          | 45 ++++++-------------------
 mempalace/fact_checker.py | 39 ++++-----------------
 3 files changed, 88 insertions(+), 67 deletions(-)
 create mode 100644 mempalace/_stdio.py

diff --git a/mempalace/_stdio.py b/mempalace/_stdio.py
new file mode 100644
index 0000000..13e9509
--- /dev/null
+++ b/mempalace/_stdio.py
@@ -0,0 +1,71 @@
+"""Stdio UTF-8 reconfiguration helper for Windows entry points.
+
+Python on Windows defaults stdio to the system ANSI codepage
+(cp1252/cp1251/cp950 depending on locale), which mojibakes UTF-8 input
+or output the moment a non-Latin character shows up. Every console
+entry point that touches stdio needs to fix this on Windows -- the MCP
+server, the CLI, the fact_checker `--stdin` mode -- so the
+reconfigure code lives here in one place to keep the per-stream
+errors policies aligned across them.
+
+Per-stream errors policy is caller-chosen:
+
+* MCP server uses ``strict`` on stdout/stderr because everything written
+  there is server-controlled JSON-RPC; any encode failure is a real bug
+  the operator wants loud.
+* CLI / fact_checker use ``replace`` on stdout/stderr because they print
+  verbatim drawer text that may contain surrogate halves round-tripped
+  from filenames -- ``strict`` would crash mid-print.
+* All callers use ``surrogateescape`` on stdin so a malformed byte from
+  a redirected file or a misbehaving client survives as a lone surrogate
+  the consumer's parser surfaces, instead of ``UnicodeDecodeError``
+  killing the read loop on the first bad byte.
+"""
+
+from __future__ import annotations
+
+import sys
+from typing import Callable, Optional
+
+
+def reconfigure_stdio_utf8_on_windows(
+    *,
+    stdin_errors: str = "surrogateescape",
+    stdout_errors: str = "strict",
+    stderr_errors: str = "strict",
+    on_failure: Optional[Callable[[str, BaseException], None]] = None,
+) -> None:
+    """Reconfigure stdio to UTF-8 on Windows. No-op elsewhere.
+
+    Args:
+        stdin_errors: errors= policy for stdin.reconfigure().
+        stdout_errors: errors= policy for stdout.reconfigure().
+        stderr_errors: errors= policy for stderr.reconfigure().
+        on_failure: optional ``(stream_name, exc) -> None`` callback for
+            streams whose ``reconfigure`` raises (e.g. Jupyter-replaced
+            streams that lack the method-shape we expect). Defaults to a
+            ``WARNING:`` line on the original sys.stderr.
+    """
+    if sys.platform != "win32":
+        return
+
+    policies = (
+        ("stdin", stdin_errors),
+        ("stdout", stdout_errors),
+        ("stderr", stderr_errors),
+    )
+    for name, errors in policies:
+        stream = getattr(sys, name, None)
+        reconfigure = getattr(stream, "reconfigure", None)
+        if reconfigure is None:
+            continue
+        try:
+            reconfigure(encoding="utf-8", errors=errors)
+        except Exception as exc:  # noqa: BLE001 -- last-resort guard
+            if on_failure is not None:
+                on_failure(name, exc)
+            else:
+                print(
+                    f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
+                    file=sys.stderr,
+                )
diff --git a/mempalace/cli.py b/mempalace/cli.py
index 7052e1f..0ab3d0f 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -938,42 +938,17 @@ def cmd_compress(args):
 def _reconfigure_stdio_utf8_on_windows():
     """Decode stdio as UTF-8 on Windows for the primary `mempalace` CLI.
 
-    Without this, Python defaults stdio to the system ANSI codepage
-    (cp1252/cp1251/cp950 depending on locale). That mojibakes non-ASCII
-    content piped in (`mempalace search ... < query.txt`) or piped out
-    (`mempalace search "..." > out.txt`) when verbatim drawer text or
-    wing/room names contain non-Latin characters.
-
-    Per-stream errors policy:
-      stdin  -- surrogateescape: malformed bytes from a redirected file
-                survive as lone surrogates instead of crashing the read.
-      stdout -- replace: ``mempalace search`` prints verbatim drawer
-                text. A drawer that round-tripped a filename through
-                surrogateescape can hold a lone surrogate, which would
-                otherwise raise ``UnicodeEncodeError`` mid-print and
-                lose the rest of the search result block.
-      stderr -- replace: same hazard for logger output that quotes
-                user-supplied path or content.
+    Thin wrapper around the shared helper in ``mempalace._stdio``. The CLI
+    overrides stdout/stderr to ``replace`` because ``mempalace search``
+    prints verbatim drawer text that may carry surrogate halves
+    round-tripped from filenames -- ``strict`` would crash mid-print and
+    lose the rest of the search result block. stdin keeps the default
+    ``surrogateescape`` so a redirected non-UTF-8 file does not kill the
+    read on the first bad byte.
     """
-    if sys.platform != "win32":
-        return
-    policies = (
-        ("stdin", "surrogateescape"),
-        ("stdout", "replace"),
-        ("stderr", "replace"),
-    )
-    for name, errors in policies:
-        stream = getattr(sys, name, None)
-        reconfigure = getattr(stream, "reconfigure", None)
-        if reconfigure is None:
-            continue
-        try:
-            reconfigure(encoding="utf-8", errors=errors)
-        except Exception as exc:
-            print(
-                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
-                file=sys.stderr,
-            )
+    from ._stdio import reconfigure_stdio_utf8_on_windows
+
+    reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
 
 
 def main():
diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index 1844c45..403d913 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -306,40 +306,15 @@ def _edit_distance(s1: str, s2: str) -> int:
 def _reconfigure_stdio_utf8_on_windows():
     """Decode --stdin payload as UTF-8 on Windows.
 
-    Without this, Python defaults stdio to the system ANSI codepage
-    (cp1252/cp1251/cp950 depending on locale), which mojibakes
-    non-ASCII fact text before pattern parsing sees it.
-
-    Per-stream errors policy mirrors the primary CLI helper in
-    ``mempalace/cli.py``:
-      stdin  -- surrogateescape: malformed input bytes survive as lone
-                surrogates instead of crashing the read.
-      stdout -- replace: extracted fact text can include surrogate
-                halves round-tripped from filenames; replace prevents
-                a UnicodeEncodeError mid-print.
-      stderr -- replace: same protection for warning lines.
+    Thin wrapper around the shared helper in ``mempalace._stdio``. Mirrors
+    the primary CLI policy: stdout/stderr use ``replace`` because
+    extracted fact text can include surrogate halves round-tripped from
+    filenames -- ``strict`` would raise UnicodeEncodeError mid-print.
+    stdin keeps the default ``surrogateescape``.
     """
-    import sys
+    from ._stdio import reconfigure_stdio_utf8_on_windows
 
-    if sys.platform != "win32":
-        return
-    policies = (
-        ("stdin", "surrogateescape"),
-        ("stdout", "replace"),
-        ("stderr", "replace"),
-    )
-    for name, errors in policies:
-        stream = getattr(sys, name, None)
-        reconfigure = getattr(stream, "reconfigure", None)
-        if reconfigure is None:
-            continue
-        try:
-            reconfigure(encoding="utf-8", errors=errors)
-        except Exception as exc:
-            print(
-                f"WARNING: Could not reconfigure {name} to UTF-8: {exc}",
-                file=sys.stderr,
-            )
+    reconfigure_stdio_utf8_on_windows(stdout_errors="replace", stderr_errors="replace")
 
 
 if __name__ == "__main__":

From f9d939ae1bd5edba6901f0fd3872cff7ab30f790 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Mon, 4 May 2026 06:45:29 +0000
Subject: [PATCH 050/127] fix(storage): quarantine bloated HNSW link payloads

---
 mempalace/backends/chroma.py      | 155 ++++++++++++++++++------------
 tests/test_hnsw_payload_health.py | 113 ++++++++++++++++++++++
 2 files changed, 209 insertions(+), 59 deletions(-)
 create mode 100644 tests/test_hnsw_payload_health.py

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index d9b99a4..02684d2 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -29,6 +29,51 @@
 _OPTIONAL_OPERATORS = frozenset({"$gt", "$gte", "$lt", "$lte"})
 _SUPPORTED_OPERATORS = _REQUIRED_OPERATORS | _OPTIONAL_OPERATORS
 
+# A healthy HNSW payload should keep link_lists.bin proportional to
+# data_level0.bin. When link_lists.bin grows orders of magnitude larger than
+# data_level0.bin, Chroma/HNSW can segfault while opening the segment even if
+# index_metadata.pickle is structurally valid.
+#
+# The report in #1218 showed ratios above 300x, while healthy snapshots were far below 1x.
+# Treat only >10x as corruption so normal flush lag or small segments do not get
+# quarantined.
+_HNSW_LINK_TO_DATA_MAX_RATIO = 10.0
+
+
+def _hnsw_link_to_data_ratio(seg_dir: str) -> Optional[float]:
+    """Return link_lists.bin / data_level0.bin size ratio for a segment.
+
+    ``None`` means the ratio is not meaningful, usually because one file is
+    missing or data_level0.bin is empty. ``float("inf")`` means the files were
+    present but could not be statted safely, which should be treated as
+    suspicious by callers.
+    """
+
+    link_path = os.path.join(seg_dir, "link_lists.bin")
+    data_path = os.path.join(seg_dir, "data_level0.bin")
+
+    if not (os.path.isfile(link_path) and os.path.isfile(data_path)):
+        return None
+
+    try:
+        data_size = os.path.getsize(data_path)
+        link_size = os.path.getsize(link_path)
+    except OSError:
+        return float("inf")
+
+    if data_size <= 0:
+        return None
+
+    return link_size / data_size
+
+
+def _hnsw_payload_appears_sane(seg_dir: str) -> bool:
+    """Return False when HNSW payload files are structurally implausible."""
+
+    ratio = _hnsw_link_to_data_ratio(seg_dir)
+    return ratio is None or ratio <= _HNSW_LINK_TO_DATA_MAX_RATIO
+
+
 # HNSW tuning to prevent link_lists.bin bloat on large mines (#344).
 #
 # With default params (batch_size=100, sync_threshold=1000, initial capacity
@@ -106,6 +151,9 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
     files and quarantine_stale_hnsw would conservatively rename them
     out of the way (lazy rebuild on next open recovers).
     """
+    if not _hnsw_payload_appears_sane(seg_dir):
+        return False
+
     meta_path = os.path.join(seg_dir, "index_metadata.pickle")
     if not os.path.isfile(meta_path):
         # No metadata file yet — segment hasn't flushed (fresh / empty).
@@ -127,64 +175,35 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
 
 
 def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> list[str]:
-    """Rename HNSW segment dirs that are both stale-by-mtime AND fail an
-    integrity sniff-test.
-
-    Catches the segfault failure mode from #823 (semantic search stale
-    after ``add_drawer``), observed at neo-cortex-mcp#2 (SIGSEGV on
-    ``count()`` with chromadb 1.5.5), and acknowledged as by-design at
-    chroma-core/chroma#2594. Renaming a corrupt segment lets chromadb
-    rebuild lazily on next open instead of segfaulting.
-
-    Two-stage check:
-
-    1. **mtime gate.** If ``chroma.sqlite3`` is less than
-       ``stale_seconds`` newer than the segment's ``data_level0.bin``,
-       skip — chromadb is in normal write-path territory.
-
-    2. **Integrity gate** (``_segment_appears_healthy``). Even when the
-       mtime gap exceeds the threshold, a segment whose
-       ``index_metadata.pickle`` passes a format sniff-test is healthy:
-       chromadb 1.5.x flushes HNSW state asynchronously and a clean
-       shutdown does NOT force-flush, so the on-disk HNSW is *always*
-       somewhat older than ``chroma.sqlite3``. Production observation
-       (2026-04-26 disks daemon): three of three segments quarantined
-       on every cold start, with 538-557s gaps, leaving the 151K-drawer
-       palace with vector_ranked=0 until rebuild. Renaming a healthy
-       segment based on mtime alone destroys a valid index — chromadb
-       creates an empty replacement, orphaning every drawer in sqlite
-       from vector recall until the operator runs ``mempalace repair
-       --mode rebuild`` (15+ min on a 151K palace).
-
-    Only segments that pass stage 1 (suspiciously stale) AND fail stage
-    2 (metadata file truncated, zero-filled, or absent-with-data) are
-    renamed to ``<uuid>.drift-<timestamp>``. The original directory is
-    renamed, not deleted, so recovery remains possible if the heuristic
-    misfires.
-
-    The default threshold (5 min) is advisory under daemon-strict; the
-    integrity gate is what actually distinguishes corruption from flush
-    lag. The threshold still matters for the cross-machine replication
-    case (#823), where it bounds how stale a Syncthing-replicated
-    segment can be before we look harder at it.
-
-    Args:
-        palace_path: path to the palace directory containing ``chroma.sqlite3``
-        stale_seconds: minimum mtime gap to *consider* a segment for quarantine
-
-    Returns:
-        List of paths that were quarantined (empty if nothing actually
-        looked corrupt).
+    """Rename HNSW segment dirs that look unsafe to open.
+
+    This catches two classes of HNSW corruption before ChromaDB opens the
+    native segment reader:
+
+    1. stale-by-mtime segments whose ``index_metadata.pickle`` fails the
+       existing format sniff-test;
+    2. structurally impossible HNSW payloads where ``link_lists.bin`` is much
+       larger than ``data_level0.bin``.
+
+    The second check is intentionally not gated by mtime. A segment with a
+    300x link/data ratio is unsafe regardless of whether its mtime is recent;
+    letting Chroma open it can SIGSEGV before Python fallback code runs.
+
+    The original directory is renamed, not deleted, so recovery remains
+    possible if the heuristic ever misfires.
     """
+
     db_path = os.path.join(palace_path, "chroma.sqlite3")
     if not os.path.isfile(db_path):
         return []
+
     try:
         sqlite_mtime = os.path.getmtime(db_path)
     except OSError:
         return []
 
     moved: list[str] = []
+
     try:
         entries = os.listdir(palace_path)
     except OSError:
@@ -193,29 +212,34 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
     for name in entries:
         if "-" not in name or name.startswith(".") or ".drift-" in name:
             continue
+
         seg_dir = os.path.join(palace_path, name)
         if not os.path.isdir(seg_dir):
             continue
+
         hnsw_bin = os.path.join(seg_dir, "data_level0.bin")
         if not os.path.isfile(hnsw_bin):
             continue
+
         try:
             hnsw_mtime = os.path.getmtime(hnsw_bin)
         except OSError:
             continue
-        if sqlite_mtime - hnsw_mtime < stale_seconds:
+
+        payload_ratio = _hnsw_link_to_data_ratio(seg_dir)
+        payload_corrupt = payload_ratio is not None and payload_ratio > _HNSW_LINK_TO_DATA_MAX_RATIO
+
+        if not payload_corrupt and sqlite_mtime - hnsw_mtime < stale_seconds:
             continue
 
-        # Stage 2: integrity gate. mtime drift is necessary but not
-        # sufficient — chromadb's async flush makes drift the steady-
-        # state condition. A healthy segment metadata file proves
-        # chromadb can open the segment without segfault; don't
-        # quarantine a healthy index.
-        if _segment_appears_healthy(seg_dir):
+        # Stage 2: integrity gate. Mtime drift alone is not corruption because
+        # Chroma flushes HNSW asynchronously. A healthy metadata file proves the
+        # ordinary stale-by-mtime case is just flush lag.
+        if not payload_corrupt and _segment_appears_healthy(seg_dir):
             logger.info(
                 "HNSW mtime gap %.0fs on %s exceeds threshold but segment "
-                "metadata file is intact — flush-lag, not corruption. "
-                "Leaving in place.",
+                "metadata and payload size are intact — flush-lag, not "
+                "corruption. Leaving in place.",
                 sqlite_mtime - hnsw_mtime,
                 seg_dir,
             )
@@ -223,17 +247,30 @@ def quarantine_stale_hnsw(palace_path: str, stale_seconds: float = 300.0) -> lis
 
         stamp = _dt.datetime.now().strftime("%Y%m%d-%H%M%S")
         target = f"{seg_dir}.drift-{stamp}"
+
+        if payload_corrupt:
+            reason = (
+                f"link_lists.bin/data_level0.bin ratio {payload_ratio:.1f}x "
+                f"exceeds {_HNSW_LINK_TO_DATA_MAX_RATIO:.1f}x"
+            )
+        else:
+            reason = (
+                f"sqlite {sqlite_mtime - hnsw_mtime:.0f}s newer than HNSW "
+                "and integrity check failed"
+            )
+
         try:
             os.rename(seg_dir, target)
             moved.append(target)
             logger.warning(
-                "Quarantined corrupt HNSW segment %s (sqlite %.0fs newer than HNSW, integrity check failed); renamed to %s",
+                "Quarantined corrupt HNSW segment %s (%s); renamed to %s",
                 seg_dir,
-                sqlite_mtime - hnsw_mtime,
+                reason,
                 target,
             )
         except OSError:
             logger.exception("Failed to quarantine corrupt HNSW segment %s", seg_dir)
+
     return moved
 
 
diff --git a/tests/test_hnsw_payload_health.py b/tests/test_hnsw_payload_health.py
new file mode 100644
index 0000000..0af440a
--- /dev/null
+++ b/tests/test_hnsw_payload_health.py
@@ -0,0 +1,113 @@
+import os
+from pathlib import Path
+
+from mempalace.backends.chroma import (
+    _HNSW_LINK_TO_DATA_MAX_RATIO,
+    _hnsw_link_to_data_ratio,
+    _segment_appears_healthy,
+    quarantine_stale_hnsw,
+)
+
+
+def _write_segment(
+    seg_dir: Path,
+    *,
+    data_size: int = 100,
+    link_size: int = 100,
+    write_metadata: bool = True,
+) -> None:
+    seg_dir.mkdir(parents=True, exist_ok=True)
+    (seg_dir / "data_level0.bin").write_bytes(b"\0" * data_size)
+    (seg_dir / "link_lists.bin").write_bytes(b"\0" * link_size)
+
+    if write_metadata:
+        # Enough bytes to pass the existing pickle envelope sniff-test:
+        # starts with pickle protocol marker 0x80 and ends with STOP 0x2e.
+        (seg_dir / "index_metadata.pickle").write_bytes(b"\x80" + b"x" * 16 + b"\x2e")
+
+
+def test_hnsw_link_to_data_ratio_reports_payload_size_ratio(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(seg_dir, data_size=100, link_size=250)
+
+    assert _hnsw_link_to_data_ratio(str(seg_dir)) == 2.5
+
+
+def test_segment_health_rejects_exploded_link_lists_even_with_valid_pickle(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
+        write_metadata=True,
+    )
+
+    assert not _segment_appears_healthy(str(seg_dir))
+
+
+def test_segment_health_keeps_reasonable_payload_with_valid_pickle(tmp_path):
+    seg_dir = tmp_path / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * _HNSW_LINK_TO_DATA_MAX_RATIO),
+        write_metadata=True,
+    )
+
+    assert _segment_appears_healthy(str(seg_dir))
+
+
+def test_quarantine_catches_link_bloat_without_mtime_drift(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+
+    db_path = palace / "chroma.sqlite3"
+    db_path.write_text("sqlite placeholder")
+
+    seg_dir = palace / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=int(100 * (_HNSW_LINK_TO_DATA_MAX_RATIO + 1)),
+        write_metadata=True,
+    )
+
+    # Make sqlite and HNSW mtimes identical. The old mtime-only gate would
+    # skip this segment even though the payload is structurally corrupt.
+    same_time = 1_700_000_000
+    os.utime(db_path, (same_time, same_time))
+    os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
+
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
+
+    assert len(moved) == 1
+    assert not seg_dir.exists()
+
+    moved_path = Path(moved[0])
+    assert moved_path.exists()
+    assert moved_path.name.startswith("11111111-2222-3333-4444-555555555555.drift-")
+
+
+def test_quarantine_leaves_reasonable_payload_in_place(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+
+    db_path = palace / "chroma.sqlite3"
+    db_path.write_text("sqlite placeholder")
+
+    seg_dir = palace / "11111111-2222-3333-4444-555555555555"
+    _write_segment(
+        seg_dir,
+        data_size=100,
+        link_size=100,
+        write_metadata=True,
+    )
+
+    same_time = 1_700_000_000
+    os.utime(db_path, (same_time, same_time))
+    os.utime(seg_dir / "data_level0.bin", (same_time, same_time))
+
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=999_999)
+
+    assert moved == []
+    assert seg_dir.exists()

From 4f36145c2e51aa53c705c67a3f692174230bbcf9 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Sun, 26 Apr 2026 13:01:55 +0200
Subject: [PATCH 051/127] fix(entity_registry): atomic write to prevent partial
 corruption on crash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

EntityRegistry.save() called Path.write_text() directly, which truncates
the target file and then writes — so a crash mid-write (power loss, OOM,
filesystem-full mid-flush) leaves an empty or half-written
entity_registry.json. The whole people/projects map is lost; the system
falls back to an empty registry on next load.

Switch to the standard atomic-write pattern: serialize to a sibling
.tmp file in the same directory (so os.replace stays on one filesystem),
fsync, chmod 0o600, then os.replace over the target. The replace is
atomic on POSIX and Windows, so any crash leaves the previous registry
intact instead of a truncated file.

Tests cover: no leftover .tmp on success, and previous content preserved
when os.replace itself raises mid-save.
---
 mempalace/entity_registry.py  | 15 ++++++++++--
 tests/test_entity_registry.py | 46 +++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py
index 78d8a8b..d77b6c1 100644
--- a/mempalace/entity_registry.py
+++ b/mempalace/entity_registry.py
@@ -16,6 +16,7 @@
 """
 
 import json
+import os
 import re
 import urllib.request
 import urllib.parse
@@ -320,11 +321,21 @@ def save(self):
             self._path.parent.chmod(0o700)
         except (OSError, NotImplementedError):
             pass
-        self._path.write_text(json.dumps(self._data, indent=2), encoding="utf-8")
+        # Atomic write: serialize to a sibling temp file in the same dir
+        # (so os.replace stays on one filesystem), fsync, then rename over
+        # the target. A crash mid-write leaves the previous registry intact
+        # instead of a half-written file or an empty file from the truncate.
+        payload = json.dumps(self._data, indent=2)
+        tmp_path = self._path.with_name(self._path.name + ".tmp")
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            f.write(payload)
+            f.flush()
+            os.fsync(f.fileno())
         try:
-            self._path.chmod(0o600)
+            tmp_path.chmod(0o600)
         except (OSError, NotImplementedError):
             pass
+        os.replace(tmp_path, self._path)
 
     @staticmethod
     def _empty() -> dict:
diff --git a/tests/test_entity_registry.py b/tests/test_entity_registry.py
index c857a07..a5f237c 100644
--- a/tests/test_entity_registry.py
+++ b/tests/test_entity_registry.py
@@ -2,6 +2,8 @@
 
 from unittest.mock import patch
 
+import pytest
+
 from mempalace.entity_registry import (
     COMMON_ENGLISH_WORDS,
     PERSON_CONTEXT_PATTERNS,
@@ -71,6 +73,50 @@ def test_save_creates_file(tmp_path):
     assert (tmp_path / "entity_registry.json").exists()
 
 
+def test_save_is_atomic_does_not_leave_tmp(tmp_path):
+    # Atomic write must not leave the .tmp sidecar file after a successful save.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.save()
+    leftover = list(tmp_path.glob("entity_registry.json.tmp*"))
+    assert leftover == [], f"atomic write leaked tmp file(s): {leftover}"
+
+
+def test_save_preserves_previous_on_serialization_failure(tmp_path, monkeypatch):
+    # If serialization fails mid-write, the previous registry must remain
+    # intact — this is the whole point of atomic write vs truncating in place.
+    registry = EntityRegistry.load(config_dir=tmp_path)
+    registry.seed(
+        mode="personal",
+        people=[{"name": "Alice", "relationship": "friend", "context": "personal"}],
+        projects=[],
+    )
+    registry.save()
+    target = tmp_path / "entity_registry.json"
+    original = target.read_text(encoding="utf-8")
+
+    # Force os.replace to raise — simulates filesystem full / permission flip
+    # AFTER the temp file is written but BEFORE the rename completes.
+    import os as _os
+
+    real_replace = _os.replace
+
+    def boom(src, dst):
+        raise OSError("simulated rename failure")
+
+    monkeypatch.setattr(_os, "replace", boom)
+    with pytest.raises(OSError):
+        registry.seed(
+            mode="personal",
+            people=[{"name": "Bob", "relationship": "friend", "context": "personal"}],
+            projects=[],
+        )
+        registry.save()
+
+    # Restore os.replace before reading so the assertion can rely on it.
+    monkeypatch.setattr(_os, "replace", real_replace)
+    assert target.read_text(encoding="utf-8") == original
+
+
 # ── seed ────────────────────────────────────────────────────────────────
 
 
From 2e441d17a22f9e14dcb9d8576551ea867e56cda8 Mon Sep 17 00:00:00 2001
From: Arnold Wender <arnold.wender@gmail.com>
Date: Tue, 28 Apr 2026 09:43:27 +0200
Subject: [PATCH 052/127] fix(entity_registry): fsync parent dir after rename
 for ext4 durability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this, on ext4 (and similar) filesystems the rename ack does not
guarantee durability across power loss — a crash can revert to a state
where the temp file is present and the target is at the old version.

Suggested by @jphein on #1215.
---
 mempalace/entity_registry.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/mempalace/entity_registry.py b/mempalace/entity_registry.py
index d77b6c1..c8ac517 100644
--- a/mempalace/entity_registry.py
+++ b/mempalace/entity_registry.py
@@ -336,6 +336,20 @@ def save(self):
         except (OSError, NotImplementedError):
             pass
         os.replace(tmp_path, self._path)
+        # On ext4 (and similar) the rename's durability across power loss
+        # requires an additional fsync on the parent directory. Without it,
+        # the kernel can ack the rename and a crash reverts to the state
+        # where the temp file is present and the target is at the old version.
+        try:
+            dir_fd = os.open(str(self._path.parent), os.O_RDONLY)
+            try:
+                os.fsync(dir_fd)
+            finally:
+                os.close(dir_fd)
+        except OSError:
+            # Windows and some special filesystems reject directory fds — they
+            # have different durability semantics on rename anyway.
+            pass
 
     @staticmethod
     def _empty() -> dict:

From eff844b1680510c2ab38456bc166bfe42c57effa Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Mon, 4 May 2026 10:34:22 +0000
Subject: [PATCH 053/127] fix(storage): quarantine partial HNSW flush without
 metadata

---
 mempalace/backends/chroma.py | 39 ++++++++++++++++++--------
 tests/test_backends.py       | 54 ++++++++++++++++++++++++++++++++++--
 2 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/mempalace/backends/chroma.py b/mempalace/backends/chroma.py
index d9b99a4..c611af5 100644
--- a/mempalace/backends/chroma.py
+++ b/mempalace/backends/chroma.py
@@ -54,6 +54,13 @@
     "hnsw:sync_threshold": 50_000,
 }
 
+# Missing index_metadata.pickle is normal only while a segment is still fresh
+# or effectively empty. Once data_level0.bin has non-trivial payload, a
+# missing metadata pickle means the segment was interrupted after writing HNSW
+# data but before writing its metadata. Letting Chroma open that shape can
+# segfault or hang in native HNSW code.
+_HNSW_MISSING_METADATA_DATA_FLOOR = 1024
+
 
 def _validate_where(where: Optional[dict]) -> None:
     """Scan a where-clause for unknown operators and raise ``UnsupportedFilterError``.
@@ -84,16 +91,13 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
     parsing it. ChromaDB writes that file after a successful HNSW flush;
     a complete write starts with byte ``0x80`` and ends with byte
     ``0x2e`` (the protocol/terminator byte sequence chromadb serializes
-    with). If both bytes are present and the file is non-trivially sized,
-    chromadb will load the segment cleanly even when its on-disk mtime
-    trails ``chroma.sqlite3`` — which is the *steady state* under
-    chromadb 1.5.x's async batched flush, not corruption.
+    with).
 
-    A missing metadata file is treated as "fresh / never-flushed" and
-    considered healthy. Renaming an empty dir orphans nothing, and a
-    real corruption case manifests as a present-but-malformed file or a
-    chromadb load error caught downstream by palace-daemon's
-    ``_auto_repair`` retry path.
+    Missing metadata is healthy only while the segment still looks fresh or
+    empty. If ``data_level0.bin`` already has non-trivial payload but
+    ``index_metadata.pickle`` is missing, the segment is partially flushed:
+    Chroma wrote vector data without the metadata it needs to reopen the
+    HNSW reader safely.
 
     Deliberately format-sniffs only; never deserializes. Deserialization
     can execute arbitrary code, and the byte-sniff is sufficient to
@@ -104,13 +108,24 @@ def _segment_appears_healthy(seg_dir: str) -> bool:
     chromadb writes today; if a future chromadb version emits protocol
     0/1 segments, this check would start returning False on healthy
     files and quarantine_stale_hnsw would conservatively rename them
-    out of the way (lazy rebuild on next open recovers).
+    out of the way.
     """
+
     meta_path = os.path.join(seg_dir, "index_metadata.pickle")
     if not os.path.isfile(meta_path):
-        # No metadata file yet — segment hasn't flushed (fresh / empty).
-        # Renaming would orphan nothing; consider healthy.
+        data_path = os.path.join(seg_dir, "data_level0.bin")
+        try:
+            if (
+                os.path.isfile(data_path)
+                and os.path.getsize(data_path) > _HNSW_MISSING_METADATA_DATA_FLOOR
+            ):
+                return False
+        except OSError:
+            return False
+
+        # No metadata and no meaningful vector payload yet: fresh/empty segment.
         return True
+
     try:
         size = os.path.getsize(meta_path)
         # A real chromadb metadata file is at least tens of bytes; a
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 4ddfe12..3a8392b 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -16,8 +16,10 @@
 from mempalace.backends.chroma import (
     ChromaBackend,
     ChromaCollection,
+    _HNSW_MISSING_METADATA_DATA_FLOOR,
     _fix_blob_seq_ids,
     _pin_hnsw_threads,
+    _segment_appears_healthy,
     quarantine_stale_hnsw,
 )
 
@@ -636,9 +638,9 @@ def test_quarantine_stale_hnsw_leaves_healthy_segment_with_drift_alone(tmp_path)
     assert seg.exists()
 
 
-def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
-    """Segment with no metadata file is treated as fresh / never-flushed
-    and not quarantined — renaming an empty dir orphans nothing."""
+def test_quarantine_stale_hnsw_leaves_empty_segment_without_metadata_alone(tmp_path):
+    """Missing metadata is okay only when the segment has no meaningful data yet."""
+
     now = 1_700_000_000.0
     palace, seg = _make_palace_with_segment(
         tmp_path,
@@ -646,11 +648,57 @@ def test_quarantine_stale_hnsw_leaves_segment_without_metadata_alone(tmp_path):
         sqlite_mtime=now,
         meta_bytes=None,
     )
+
     moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+
     assert moved == []
     assert seg.exists()
 
 
+def test_segment_without_metadata_but_with_nontrivial_data_is_unhealthy(tmp_path):
+    """Data without index_metadata.pickle is a partial flush, not a fresh segment."""
+
+    seg = tmp_path / "abcd-1234-5678"
+    seg.mkdir()
+    (seg / "data_level0.bin").write_bytes(b"\0" * (_HNSW_MISSING_METADATA_DATA_FLOOR + 1))
+
+    assert not _segment_appears_healthy(str(seg))
+
+
+def test_segment_without_metadata_and_tiny_data_is_still_treated_as_fresh(tmp_path):
+    """Tiny data payloads can occur before metadata has flushed; leave them alone."""
+
+    seg = tmp_path / "abcd-1234-5678"
+    seg.mkdir()
+    (seg / "data_level0.bin").write_bytes(b"\0" * _HNSW_MISSING_METADATA_DATA_FLOOR)
+
+    assert _segment_appears_healthy(str(seg))
+
+
+def test_quarantine_stale_hnsw_renames_missing_metadata_with_nontrivial_data(tmp_path):
+    """Regression for #1274: missing pickle + non-trivial data must quarantine."""
+
+    now = 1_700_000_000.0
+    palace, seg = _make_palace_with_segment(
+        tmp_path,
+        hnsw_mtime=now - 7200,
+        sqlite_mtime=now,
+        meta_bytes=None,
+    )
+    (seg / "data_level0.bin").write_bytes(b"\0" * (_HNSW_MISSING_METADATA_DATA_FLOOR + 1))
+    os.utime(seg / "data_level0.bin", (now - 7200, now - 7200))
+
+    moved = quarantine_stale_hnsw(str(palace), stale_seconds=3600.0)
+
+    assert len(moved) == 1
+    assert ".drift-" in moved[0]
+    assert not seg.exists()
+
+    drift_dirs = [p for p in palace.iterdir() if ".drift-" in p.name]
+    assert len(drift_dirs) == 1
+    assert (drift_dirs[0] / "data_level0.bin").exists()
+
+
 def test_quarantine_stale_hnsw_renames_truncated_metadata(tmp_path):
     """Segment with a truncated (under-floor-size) metadata file is
     quarantined — shape of a partial-flush during process kill."""

From 37e7d394b8cc5ada0508dff62feb54cfb7cb0716 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Tue, 5 May 2026 07:22:10 +0000
Subject: [PATCH 054/127] fix(repair): preflight poisoned max_seq_id

---
 mempalace/cli.py     | 19 +++++++++++--
 mempalace/repair.py  | 59 ++++++++++++++++++++++++++++++++++++++-
 tests/test_repair.py | 66 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 4 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index f2606a4..103c4ae 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -654,7 +654,11 @@ def cmd_repair(args):
     import shutil
     from .backends.chroma import ChromaBackend
     from .migrate import confirm_destructive_action, contains_palace_database
-    from .repair import TruncationDetected, check_extraction_safety
+    from .repair import (
+        TruncationDetected,
+        check_extraction_safety,
+        maybe_repair_poisoned_max_seq_id_before_rebuild,
+    )
 
     palace_path = os.path.abspath(
         os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
@@ -679,11 +683,20 @@ def cmd_repair(args):
         print(f"\n  No palace found at {palace_path}")
         return
     if not contains_palace_database(palace_path):
-        print(f"\n  No palace database found at {db_path}")
+        print(f"\n No palace database found at {db_path}")
+        return
+
+    preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
+        palace_path,
+        backup=getattr(args, "backup", True),
+        dry_run=getattr(args, "dry_run", False),
+        assume_yes=getattr(args, "yes", False),
+    )
+    if preflight is not None:
         return
 
     print(f"\n{'=' * 55}")
-    print("  MemPalace Repair")
+    print(" MemPalace Repair")
     print(f"{'=' * 55}\n")
     print(f"  Palace: {palace_path}")
 
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 1cd1556..09970de 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -330,6 +330,56 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
         return None
 
 
+def maybe_repair_poisoned_max_seq_id_before_rebuild(
+    palace_path: str,
+    *,
+    backup: bool = True,
+    dry_run: bool = False,
+    assume_yes: bool = False,
+) -> "dict | None":
+    """Run non-destructive max_seq_id repair before a rebuild if needed.
+
+    A poisoned ``max_seq_id`` row can make Chroma believe it has already
+    consumed every row in ``embeddings_queue``. Writes then report success
+    because they land in the queue, but they never become visible in
+    ``embeddings``.
+
+    If this precise corruption is present, do the narrow bookmark repair and
+    stop instead of continuing into the legacy rebuild path. The rebuild path
+    extracts only already-visible embeddings and can discard queued writes.
+    """
+
+    db_path = os.path.join(palace_path, "chroma.sqlite3")
+    if not os.path.isfile(db_path):
+        return None
+
+    try:
+        poisoned = _detect_poisoned_max_seq_ids(db_path)
+    except Exception:
+        return None
+
+    if not poisoned:
+        return None
+
+    print("\n Detected poisoned max_seq_id rows before repair rebuild.")
+    print(
+        " This can make writes report success while embeddings_queue grows "
+        "and embeddings stay static."
+    )
+    print(" Running the non-destructive max_seq_id repair instead of rebuilding " "the collection.")
+    print(
+        " Queued writes remain in chroma.sqlite3 for Chroma to drain after "
+        "the bookmark is unpoisoned."
+    )
+
+    return repair_max_seq_id(
+        palace_path,
+        backup=backup,
+        dry_run=dry_run,
+        assume_yes=assume_yes,
+    )
+
+
 def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     """Rebuild the HNSW index from scratch.
 
@@ -353,7 +403,14 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     print(f"\n{'=' * 55}")
     print("  MemPalace Repair — Index Rebuild")
     print(f"{'=' * 55}\n")
-    print(f"  Palace: {palace_path}")
+    print(f" Palace: {palace_path}")
+
+    preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
+        palace_path,
+        assume_yes=True,
+    )
+    if preflight is not None:
+        return
 
     backend = ChromaBackend()
     try:
diff --git a/tests/test_repair.py b/tests/test_repair.py
index bc770dd..6f1802f 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -682,3 +682,69 @@ def flaky_detect(*args, **kwargs):
     # A backup file is still present — caller can roll back from it.
     leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
     assert leftover
+
+
+def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
+    """#1295: default repair preflight must not drop queued writes."""
+
+    palace = str(tmp_path / "palace")
+    seg = _seed_poisoned_max_seq_id(
+        palace,
+        drawers_meta_max=102,
+        closets_meta_max=11,
+    )
+    db_path = os.path.join(palace, "chroma.sqlite3")
+
+    with sqlite3.connect(db_path) as conn:
+        conn.executemany(
+            "INSERT INTO embeddings_queue(seq_id, topic, id) VALUES (?, ?, ?)",
+            [
+                (seq_id, "persistent://default/default/mempalace_drawers", f"queued-{seq_id}")
+                for seq_id in range(103, 123)
+            ],
+        )
+        conn.commit()
+
+    result = repair.maybe_repair_poisoned_max_seq_id_before_rebuild(
+        palace,
+        assume_yes=True,
+    )
+
+    assert result is not None
+    assert result["segment_repaired"]
+
+    with sqlite3.connect(db_path) as conn:
+        max_seq_rows = dict(conn.execute("SELECT segment_id, seq_id FROM max_seq_id"))
+        queue_count = conn.execute("SELECT COUNT(*) FROM embeddings_queue").fetchone()[0]
+
+    assert max_seq_rows[seg["drawers_vec"]] == seg["drawers_meta_max"]
+    assert max_seq_rows[seg["drawers_meta"]] == seg["drawers_meta_max"]
+    assert max_seq_rows[seg["closets_vec"]] == seg["closets_meta_max"]
+    assert max_seq_rows[seg["closets_meta"]] == seg["closets_meta_max"]
+
+    # The old legacy rebuild path can discard queued writes. The preflight
+    # repair must leave them on disk for Chroma to drain after the bookmark is
+    # unpoisoned.
+    assert queue_count == 20
+
+
+def test_rebuild_index_repairs_poisoned_max_seq_id_before_collection_rebuild(tmp_path, capsys):
+    """A poisoned bookmark should short-circuit before the legacy rebuild path."""
+
+    palace = str(tmp_path / "palace")
+    _seed_poisoned_max_seq_id(palace)
+
+    with patch("mempalace.repair.ChromaBackend") as mock_backend:
+        repair.rebuild_index(palace)
+
+    out = capsys.readouterr().out
+    backend = mock_backend.return_value
+
+    # repair_max_seq_id may instantiate ChromaBackend to close cached clients
+    # after editing sqlite directly. That is safe. The important thing is that
+    # rebuild_index must not continue into the legacy Chroma collection read /
+    # count / rebuild path after the max_seq_id preflight handles the issue.
+    backend.get_collection.assert_not_called()
+
+    assert "Detected poisoned max_seq_id rows" in out
+    assert "non-destructive max_seq_id repair" in out

From bb40a529fde600ba01b49e6ead06820968bba446 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Tue, 5 May 2026 09:01:05 +0000
Subject: [PATCH 055/127] fix(migrate): verify write roundtrip before bailout

---
 mempalace/migrate.py  |  73 +++++++++++++++++++++++++++---
 tests/test_migrate.py | 101 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 167 insertions(+), 7 deletions(-)

diff --git a/mempalace/migrate.py b/mempalace/migrate.py
index 76aa054..5b74591 100644
--- a/mempalace/migrate.py
+++ b/mempalace/migrate.py
@@ -22,6 +22,7 @@
 import os
 import shutil
 import sqlite3
+import uuid
 from collections import defaultdict
 from datetime import datetime
 
@@ -155,6 +156,55 @@ def confirm_destructive_action(
     return True
 
 
+def _result_ids(result) -> list:
+    """Return ids from either the backend typed result or raw Chroma dict."""
+
+    if isinstance(result, dict):
+        return list(result.get("ids") or [])
+
+    return list(getattr(result, "ids", []) or [])
+
+
+def collection_write_roundtrip_works(col) -> bool:
+    """Return True only if the collection can upsert, read, and delete.
+
+    Some ChromaDB 0.6.x -> 1.5.x migrated collections remain readable while
+    writes and deletes silently no-op. A plain ``count()`` probe misses that
+    failure mode, so migrate must verify an actual write round-trip before
+    deciding that no rebuild is needed.
+    """
+
+    probe_id = f"_mempalace_migrate_probe_{uuid.uuid4().hex}"
+    probe_doc = "mempalace migrate write round-trip probe"
+    probe_meta = {
+        "wing": "_mempalace_probe",
+        "room": "_mempalace_probe",
+        "source_file": "mempalace_migrate_probe",
+        "chunk_index": 0,
+    }
+
+    try:
+        col.upsert(
+            ids=[probe_id],
+            documents=[probe_doc],
+            metadatas=[probe_meta],
+        )
+
+        after_upsert = col.get(ids=[probe_id], include=[])
+        if probe_id not in _result_ids(after_upsert):
+            return False
+
+        col.delete(ids=[probe_id])
+
+        after_delete = col.get(ids=[probe_id], include=[])
+        if probe_id in _result_ids(after_delete):
+            return False
+
+        return True
+    except Exception:
+        return False
+
+
 def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
     """Migrate a palace to the currently installed ChromaDB version."""
     from .backends.chroma import ChromaBackend
@@ -179,16 +229,27 @@ def migrate(palace_path: str, dry_run: bool = False, confirm: bool = False):
     print(f"  Source:    ChromaDB {source_version}")
     print(f"  Target:    ChromaDB {target_version}")
 
-    # Try reading with current chromadb first
+    # Try reading and writing with current chromadb first.
+    #
+    # A plain count() is not enough: some 0.6.x -> 1.5.x migrated collections
+    # are readable but silently drop upsert/delete operations. In that state,
+    # migrate must rebuild from SQLite instead of returning "No migration needed."
     try:
         col = ChromaBackend().get_collection(palace_path, "mempalace_drawers")
         count = col.count()
-        print(f"\n  Palace is already readable by chromadb {target_version}.")
-        print(f"  {count} drawers found. No migration needed.")
-        return True
+
+        if collection_write_roundtrip_works(col):
+            print(f"\n Palace is already readable and writable by chromadb {target_version}.")
+            print(f" {count} drawers found. No migration needed.")
+            return True
+
+        print(
+            f"\n Palace is readable by chromadb {target_version}, but write/delete verification failed."
+        )
+        print(" Rebuilding from SQLite to restore native write/delete behavior...")
     except Exception:
-        print(f"\n  Palace is NOT readable by chromadb {target_version}.")
-        print("  Extracting from SQLite directly...")
+        print(f"\n Palace is NOT readable by chromadb {target_version}.")
+        print(" Extracting from SQLite directly...")
 
     # Extract all drawers via raw SQL
     drawers = extract_drawers_from_sqlite(db_path)
diff --git a/tests/test_migrate.py b/tests/test_migrate.py
index 4701048..ba12ff5 100644
--- a/tests/test_migrate.py
+++ b/tests/test_migrate.py
@@ -4,7 +4,7 @@
 from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
 
-from mempalace.migrate import _restore_stale_palace, migrate
+from mempalace.migrate import collection_write_roundtrip_works, _restore_stale_palace, migrate
 
 
 def test_migrate_requires_palace_database(tmp_path, capsys):
@@ -101,3 +101,102 @@ def test_restore_stale_palace_logs_and_swallows_on_failure(tmp_path, capsys):
     assert "CRITICAL" in out
     assert os.fspath(palace_path) in out
     assert os.fspath(stale_path) in out
+
+
+class _FakeGetResult:
+    def __init__(self, ids):
+        self.ids = ids
+
+
+class _WritableFakeCollection:
+    def __init__(self):
+        self.ids = set()
+        self.deleted = []
+
+    def upsert(self, *, ids, documents, metadatas):
+        self.ids.update(ids)
+
+    def get(self, *, ids, include=None):
+        return _FakeGetResult([drawer_id for drawer_id in ids if drawer_id in self.ids])
+
+    def delete(self, *, ids=None, where=None):
+        for drawer_id in ids or []:
+            self.ids.discard(drawer_id)
+            self.deleted.append(drawer_id)
+
+
+class _SilentWriteDropCollection(_WritableFakeCollection):
+    def upsert(self, *, ids, documents, metadatas):
+        return None
+
+
+class _SilentDeleteDropCollection(_WritableFakeCollection):
+    def delete(self, *, ids=None, where=None):
+        self.deleted.extend(ids or [])
+
+
+def test_collection_write_roundtrip_works_when_probe_persists_and_deletes():
+    col = _WritableFakeCollection()
+
+    assert collection_write_roundtrip_works(col) is True
+    assert col.ids == set()
+    assert len(col.deleted) == 1
+
+
+def test_collection_write_roundtrip_fails_when_upsert_silently_drops():
+    col = _SilentWriteDropCollection()
+
+    assert collection_write_roundtrip_works(col) is False
+    assert col.ids == set()
+
+
+def test_collection_write_roundtrip_fails_when_delete_silently_drops():
+    col = _SilentDeleteDropCollection()
+
+    assert collection_write_roundtrip_works(col) is False
+    assert len(col.ids) == 1
+
+
+def test_migrate_dry_run_rebuilds_when_collection_is_readable_but_not_writable(tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+
+    fake_col = MagicMock()
+    fake_col.count.return_value = 102
+
+    drawers = [
+        {
+            "id": "id1",
+            "document": "hello",
+            "metadata": {"wing": "test-wing", "room": "general"},
+        }
+    ]
+
+    with (
+        patch("mempalace.migrate.detect_chromadb_version", return_value="1.x"),
+        patch("mempalace.backends.chroma.ChromaBackend") as mock_backend,
+        patch(
+            "mempalace.migrate.collection_write_roundtrip_works", return_value=False
+        ) as mock_probe,
+        patch(
+            "mempalace.migrate.extract_drawers_from_sqlite", return_value=drawers
+        ) as mock_extract,
+    ):
+        mock_backend.backend_version.return_value = "1.5.8"
+        mock_backend.return_value.get_collection.return_value = fake_col
+
+        result = migrate(str(palace_dir), dry_run=True)
+
+    out = capsys.readouterr().out
+
+    assert result is True
+    mock_probe.assert_called_once_with(fake_col)
+    mock_extract.assert_called_once_with(
+        os.path.join(os.path.abspath(os.fspath(palace_dir)), "chroma.sqlite3")
+    )
+
+    assert "readable by chromadb 1.5.8, but write/delete verification failed" in out
+    assert "Rebuilding from SQLite" in out
+    assert "Extracted 1 drawers from SQLite" in out
+    assert "DRY RUN" in out

From 6b042982e8ead30e3ca2ed20ede7118b227af5cc Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Tue, 5 May 2026 15:51:43 +0000
Subject: [PATCH 056/127] fix(repair): preflight SQLite integrity before
 rebuild

---
 mempalace/repair.py  | 70 +++++++++++++++++++++++++++++++++++++
 tests/test_repair.py | 82 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 147 insertions(+), 5 deletions(-)

diff --git a/mempalace/repair.py b/mempalace/repair.py
index 1cd1556..6834a18 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -330,6 +330,71 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
         return None
 
 
+def sqlite_integrity_errors(palace_path: str) -> list[str]:
+    """Return SQLite quick_check errors for chroma.sqlite3.
+
+    The repair rebuild path eventually calls Chroma's delete_collection().
+    If the SQLite layer has corrupt secondary indexes or FTS5 shadow pages,
+    Chroma can raise an opaque SQLITE_CORRUPT_INDEX / code 779 error before
+    repair reaches the HNSW rebuild.
+
+    Run a direct SQLite quick_check first so repair can fail with a clear,
+    actionable message before invoking Chroma's destructive collection-delete
+    path.
+    """
+
+    sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
+    if not os.path.exists(sqlite_path):
+        return []
+
+    try:
+        with sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True) as conn:
+            rows = conn.execute("PRAGMA quick_check").fetchall()
+    except sqlite3.Error as e:
+        return [f"PRAGMA quick_check failed: {e}"]
+
+    errors: list[str] = []
+    for row in rows:
+        if not row:
+            continue
+        message = str(row[0])
+        if message.lower() != "ok":
+            errors.append(message)
+
+    return errors
+
+
+def print_sqlite_integrity_abort(palace_path: str, errors: list[str]) -> None:
+    """Print a clear repair abort message for SQLite-layer corruption."""
+
+    sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
+    preview = errors[:5]
+
+    print("\n ABORT: SQLite-layer corruption detected before repair rebuild.")
+    print(" `mempalace repair` will not call Chroma delete_collection() because")
+    print(" the SQLite database failed `PRAGMA quick_check`.")
+    print()
+    print(f" Database: {sqlite_path}")
+    print()
+    print(" quick_check output:")
+    for message in preview:
+        print(f"  - {message}")
+    if len(errors) > len(preview):
+        print(f"  ... and {len(errors) - len(preview)} more issue(s)")
+    print()
+    print(" This often means derived SQLite structures, such as secondary indexes")
+    print(" or FTS5 shadow tables, are corrupt while the underlying rows may still")
+    print(" be recoverable.")
+    print()
+    print(" Suggested recovery:")
+    print("  1. Stop all MemPalace writers / MCP clients.")
+    print("  2. Back up the entire palace directory.")
+    print("  3. Recover chroma.sqlite3 offline with sqlite3 `.recover` or `.dump`.")
+    print("  4. Recreate the FTS5 virtual table from intact embedding_metadata rows.")
+    print("  5. Verify `PRAGMA integrity_check` returns `ok`.")
+    print("  6. Re-run `mempalace repair --yes`.")
+
+
 def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     """Rebuild the HNSW index from scratch.
 
@@ -397,6 +462,11 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
         print(e.message)
         return
 
+    sqlite_errors = sqlite_integrity_errors(palace_path)
+    if sqlite_errors:
+        print_sqlite_integrity_abort(palace_path, sqlite_errors)
+        return
+
     # Back up ONLY the SQLite database, not the bloated HNSW files
     sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
     backup_path = sqlite_path + ".backup"
diff --git a/tests/test_repair.py b/tests/test_repair.py
index bc770dd..ac1761f 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -216,9 +216,11 @@ def test_rebuild_index_empty_palace(mock_backend_cls, mock_shutil, tmp_path):
 @patch("mempalace.repair.shutil")
 @patch("mempalace.repair.ChromaBackend")
 def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
-    # Create a fake sqlite file
+    # Create a valid sqlite file so the repair preflight can run quick_check.
     sqlite_path = tmp_path / "chroma.sqlite3"
-    sqlite_path.write_text("fake")
+    with sqlite3.connect(sqlite_path) as conn:
+        conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
+        conn.commit()
 
     mock_col = MagicMock()
     mock_col.count.return_value = 2
@@ -234,15 +236,15 @@ def test_rebuild_index_success(mock_backend_cls, mock_shutil, tmp_path):
 
     repair.rebuild_index(palace_path=str(tmp_path))
 
-    # Verify: backed up sqlite only (not copytree)
+    # Verify: backed up sqlite only, not copytree.
     mock_shutil.copy2.assert_called_once()
     assert "chroma.sqlite3" in str(mock_shutil.copy2.call_args)
 
-    # Verify: deleted and recreated (cosine is the backend default)
+    # Verify: deleted and recreated.
     mock_backend.delete_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
     mock_backend.create_collection.assert_called_once_with(str(tmp_path), "mempalace_drawers")
 
-    # Verify: used upsert not add
+    # Verify: used upsert, not add.
     mock_new_col.upsert.assert_called_once()
     mock_new_col.add.assert_not_called()
 
@@ -682,3 +684,73 @@ def flaky_detect(*args, **kwargs):
     # A backup file is still present — caller can roll back from it.
     leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
     assert leftover
+
+
+def test_sqlite_integrity_errors_returns_empty_for_healthy_db(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    db_path = palace / "chroma.sqlite3"
+
+    with sqlite3.connect(db_path) as conn:
+        conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
+        conn.commit()
+
+    assert repair.sqlite_integrity_errors(str(palace)) == []
+
+
+def test_sqlite_integrity_errors_reports_unreadable_sqlite_file(tmp_path):
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    db_path = palace / "chroma.sqlite3"
+    db_path.write_bytes(b"not a sqlite database")
+
+    errors = repair.sqlite_integrity_errors(str(palace))
+
+    assert errors
+    assert "quick_check failed" in errors[0]
+
+
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collection(
+    mock_backend_cls,
+    mock_shutil,
+    tmp_path,
+    capsys,
+):
+    """Regression for #1362: fail before Chroma delete_collection on sqlite corruption."""
+
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    with sqlite3.connect(sqlite_path) as conn:
+        conn.execute("CREATE TABLE dummy(id INTEGER PRIMARY KEY)")
+        conn.commit()
+
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+
+    with patch(
+        "mempalace.repair.sqlite_integrity_errors",
+        return_value=[
+            "Page 4 of B-tree 12345: database disk image is malformed",
+            "Page 8 of B-tree 67890: database disk image is malformed",
+        ],
+    ):
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    out = capsys.readouterr().out
+
+    assert "SQLite-layer corruption detected before repair rebuild" in out
+    assert "PRAGMA quick_check" in out
+    assert "delete_collection" in out
+    assert "Page 4 of B-tree" in out
+
+    mock_backend.delete_collection.assert_not_called()
+    mock_backend.create_collection.assert_not_called()
+    mock_shutil.copy2.assert_not_called()

From 2c0ef2c04e8e1987c98dac87c4a5950f9f86c0f3 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:38:57 -0300
Subject: [PATCH 057/127] docs(changelog): document v3.3.5 fixes from #1214
 #1105 #1215 #1107 #1282 #1167 #1160

Bundled CHANGELOG entries for the seven Tier-1 PRs merged today, including
the behavior-change call-out for #1167 (KG date validators now reject
non-ISO inputs that previously produced silent empty results).
---
 CHANGELOG.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d3982fe..ba6822a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 ### Bug Fixes
 
 - **`mempalace_diary_read` silently dropped entries on agent-name case mismatch.** `tool_diary_write` stored the `agent` metadata verbatim after `sanitize_name`, which preserves case, while `tool_diary_read` filtered by exact match. Writing as `"Claude"` and reading as `"claude"` (or vice-versa) returned zero rows. Both endpoints now lowercase `agent_name` immediately after sanitization, so reads are case-insensitive and the default per-agent wing slug is stable across casings. **Behavior change:** entries written prior to this fix under mixed-case agent names will not match the new lowercase filter; run `mempalace repair` if you need to migrate legacy diary metadata. (#1243)
+- **Knowledge-graph triples with `valid_to < valid_from` were silently invisible.** `KnowledgeGraph.query_entity()` filters with `valid_from <= as_of AND valid_to >= as_of`, so an inverted interval matches no `as_of` and the row is durably stored but unreachable — a P0 data-integrity foot-gun any caller that mixes up the two date params can hit. `add_triple()` now rejects inverted intervals at write time with a clear `ValueError` naming both bounds. Open intervals (one bound only) and point-in-time facts (`valid_from == valid_to`) remain accepted unchanged. (#1214)
+- **`ChromaBackend.close_palace()` / `close()` did not release the SQLite file lock.** Evicted clients sat in `_clients` without `close()`, and chromadb 1.5.x retains the rust-side SQLite lock until GC. Reopening the same palace path after `shutil.rmtree` + recreate within one process failed with `SQLITE_READONLY_DBMOVED` (code 1032). New `_close_client()` helper now calls `PersistentClient.close()` (with a try/except fallback for older chromadb) on `close_palace()`, on whole-backend `close()`, and on the `_client()` invalidation path that detects a missing `chroma.sqlite3`. The mtime/inode auto-invalidation branch is intentionally left alone — callers there may still hold a live `ChromaCollection`. (#1067, #1105)
+- **`EntityRegistry.save()` could leave a corrupt or empty `entity_registry.json` on crash.** `Path.write_text()` is not atomic — kernel sees `open('w')` (truncate), `write`, `close`, and any failure between truncate and full-flush (power loss, OOM, FS-full, kill -9) wipes the months-of-mining people/projects map silently (the registry's `load()` swallows `JSONDecodeError`). Save now writes to a sibling `.tmp` in the same directory, `fsync`s, `chmod 0o600`s, then `os.replace()`s into place — atomic on POSIX and Windows. The previous registry stays intact on any crash before the rename returns. (#1215)
+- **`mempalace compress` crashed on large palaces.** `regenerate_closets` fetched all closet_llm drawers in a single `col.get()`, which trips `SQLITE_MAX_VARIABLE_NUMBER` on palaces above ~32k drawers. Mirrors the #851 fix in `miner.py`: drawer fetch is now paginated at `batch_size=5000`. Per-source aggregation works across batches, so the LLM regeneration call still groups chunks correctly. (#1073, #1107)
+- **CLI and `fact_checker --stdin` mojibaked non-ASCII content on Windows.** Python defaults `sys.stdin`/`stdout`/`stderr` to the system ANSI codepage (cp1252/cp1251/cp950), so `mempalace search > out.txt` and piped fact_checker invocations corrupted Cyrillic / CJK drawer text at the process boundary. New `mempalace/_stdio.py` helper reconfigures all three streams to UTF-8 on `sys.platform == "win32"`, with per-stream `errors` policy: `surrogateescape` on stdin (preserves bad bytes from redirected files for the consumer's parser), `replace` on stdout/stderr (substitutes U+FFFD instead of `UnicodeEncodeError`-ing mid-print). With this, all three user-facing console_scripts (`mcp_server`, `hooks_cli`, `cli`/`fact_checker`) now reconfigure identically on Windows. (#1282)
+- **MCP knowledge-graph tools forwarded malformed date strings to SQLite.** `tool_kg_query` (`as_of`), `tool_kg_add` (`valid_from`), and `tool_kg_invalidate` (`ended`) accepted any string and produced empty result sets on natural-language inputs like `"March 2026"` or `"yesterday"` — callers (especially LLM agents) could not distinguish "no fact at this time" from "your date format was unrecognized." New `sanitize_iso_date()` validator in `config.py` accepts `YYYY`, `YYYY-MM`, `YYYY-MM-DD` (and passes through `None`/`""`); all three tools call it before values reach the storage layer. **Behavior change:** previously-silent date typos now raise a clear `ValueError` naming the offending field; full ISO-8601 with time (`YYYY-MM-DDTHH:MM:SS`, timezone offsets) is not yet accepted — file an issue if you have a use case. (#1164, #1167)
+- **MCP server's `_kg` was a module-level singleton.** Multi-tenant hosts that rotate `MEMPALACE_PALACE_PATH` between tool calls hit the wrong sqlite file, because the KG was constructed once at import time while the ChromaDB side was already per-call via `_get_client()`. The KG is now resolved per-call through a lazy per-path cache (`_kg_by_path` keyed by `os.path.abspath`, with a double-checked-locking init under `_kg_cache_lock`). `tool_reconnect` drains and `close()`s cached KGs alongside the existing chroma reconnect. A `_call_kg` retry guard catches `sqlite3.ProgrammingError` once after a reconnect race. (#1136, #1160)
 
 ---
 

From d1e27b8c42f3892046bb20f950d7b2e04726e230 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:47:46 -0300
Subject: [PATCH 058/127] style: ruff format new test files (CI lint)

---
 tests/test_chroma_collection_lock.py | 18 ++++++------------
 tests/test_palace_locks.py           |  6 +++---
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/tests/test_chroma_collection_lock.py b/tests/test_chroma_collection_lock.py
index b5d30fb..536b5e8 100644
--- a/tests/test_chroma_collection_lock.py
+++ b/tests/test_chroma_collection_lock.py
@@ -255,12 +255,8 @@ def test_concurrent_writers_serialize(tmp_path, monkeypatch):
     ctx = _get_mp_context()
     result_q = ctx.Queue()
 
-    p1 = ctx.Process(
-        target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q)
-    )
-    p2 = ctx.Process(
-        target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q)
-    )
+    p1 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 1, result_q))
+    p2 = ctx.Process(target=_slow_writer_target, args=(palace, str(tmp_path), 2, result_q))
     p1.start()
     # Tiny stagger so p1 wins the race deterministically; without it the
     # OS scheduler can pick either, which is also a valid outcome but
@@ -272,9 +268,7 @@ def test_concurrent_writers_serialize(tmp_path, monkeypatch):
 
     outcomes = [result_q.get(timeout=1) for _ in range(2)]
     statuses = sorted(o[0] for o in outcomes)
-    assert statuses == ["busy", "ok"], (
-        f"expected one ok + one busy, got {outcomes}"
-    )
+    assert statuses == ["busy", "ok"], f"expected one ok + one busy, got {outcomes}"
 
 
 def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
@@ -319,9 +313,9 @@ def test_read_path_does_not_acquire_lock(tmp_path, monkeypatch):
             if method is None:
                 continue
             src = inspect.getsource(method)
-            assert "_write_lock" not in src, (
-                f"{read_attr} must NOT acquire the write lock (read path)"
-            )
+            assert (
+                "_write_lock" not in src
+            ), f"{read_attr} must NOT acquire the write lock (read path)"
     finally:
         open(release, "w").close()
         holder.join(timeout=5)
diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 39aa50c..d239757 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -194,9 +194,9 @@ def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
         child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
         child.start()
         child.join(timeout=5)
-        assert result_q.get(timeout=1) == "busy", (
-            "outer lock should still be held by parent after inner re-entrant exit"
-        )
+        assert (
+            result_q.get(timeout=1) == "busy"
+        ), "outer lock should still be held by parent after inner re-entrant exit"
 
 
 def _try_acquire_expect_busy(palace_path, result_q):

From f854da779fef5634e5719a45b59b3c18acb6270b Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:57:44 -0300
Subject: [PATCH 059/127] fix(lint): hoist hooks_cli_mod import to top of
 test_hooks_cli (E402)

The alias was placed below an explanatory comment block introduced by
#1305, which trips ruff E402 (module-level import not at top of file).
Moved next to the existing 'from mempalace.hooks_cli import (...)' line.

CI lint went red on develop after #1305 merged with the failing check;
this re-greens it so subsequent PRs do not inherit the failure.
---
 tests/test_hooks_cli.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index 487acf7..19ecbaf 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -8,6 +8,7 @@
 
 import pytest
 
+import mempalace.hooks_cli as hooks_cli_mod
 from mempalace.hooks_cli import (
     SAVE_INTERVAL,
     _count_human_messages,
@@ -969,9 +970,6 @@ def test_stop_hook_rejects_injected_stop_hook_active(tmp_path):
 # STATE_DIR.mkdir() on its own.
 
 
-import mempalace.hooks_cli as hooks_cli_mod
-
-
 def _redirect_palace_root(monkeypatch, tmp_path):
     """Point PALACE_ROOT and STATE_DIR at a tmp location that does NOT exist."""
     fake_root = tmp_path / "absent-mempalace"

From 733e4353321678ea1a55c093ad1b3d1d5e8c5313 Mon Sep 17 00:00:00 2001
From: Chris Antenesse <cantenesse@predictap.com>
Date: Sat, 18 Apr 2026 15:08:01 -0500
Subject: [PATCH 060/127] fix(searcher): guard against None metadata/doc in
 search result loops

ChromaDB can return None entries in metadatas/documents lists under
partial-flush, mid-delete, upgrade-boundary, and interrupted-mine
states. Add `meta = meta or {}` and `doc = doc or ""` guards in the
three result loops (search display, closet hybrid, drawer scored) so
.get() and .strip() calls never crash on None.

Fixes #1007, #1011
---
 mempalace/searcher.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index d615623..16ea4eb 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -340,7 +340,7 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
     # `_hybrid_rank`; do the same here so CLI results match what agents
     # see via `mempalace_search`.
     hits = [
-        {"text": doc, "distance": float(dist), "metadata": meta or {}}
+        {"text": doc or "", "distance": float(dist), "metadata": meta or {}}
         for doc, meta, dist in zip(docs, metas, dists)
     ]
     hits = _hybrid_rank(hits, query)
@@ -809,6 +809,8 @@ def search_memories(
         _first_or_empty(drawer_results, "metadatas"),
         _first_or_empty(drawer_results, "distances"),
     ):
+        meta = meta or {}
+        doc = doc or ""
         # Filter on raw distance before rounding to avoid precision loss.
         if max_distance > 0.0 and dist > max_distance:
             continue

From 5347c2c71c782d04fc2023504f58dc52c2369187 Mon Sep 17 00:00:00 2001
From: eldar702 <eldarshlomi7@gmail.com>
Date: Sun, 19 Apr 2026 11:08:45 +0300
Subject: [PATCH 061/127] fix(searcher): clamp effective_distance to valid
 cosine range [0, 2]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``search_memories`` computes ``effective_dist = dist - boost`` where
``boost`` can be as large as ``CLOSET_RANK_BOOSTS[0] == 0.40`` for a
rank-0 closet hit. When the raw drawer distance is small — any
near-exact match — the subtraction goes negative.

Two downstream effects:

1. Line 418 returns ``round(max(0.0, 1 - effective_dist), 3)`` as
   ``similarity``. With ``effective_dist = -0.30`` that yields
   ``similarity = 1.30``, outside the documented ``[0, 1]`` range.
   The ``max(0.0, ...)`` only prevents negative similarities; it does
   not cap above 1.
2. Line 427 stores ``_sort_key: effective_dist`` and line 435 sorts
   ``scored`` ascending by that key. A negative key drops *below* the
   rest, so the strongest hybrid matches end up sorting after weaker
   ones — ranking inversion under the exact conditions hybrid retrieval
   is supposed to serve best.

Clamp ``effective_dist`` to the valid cosine-distance range ``[0, 2]``.
The boost still wins (closet-backed hit still ranks first), it just no
longer flips the order.

Test added: mock drawer_col (base dist 0.08 / 0.35 for two sources) +
closet_col (rank-0 closet for the 0.08 source) → assert all hits have
``0 <= similarity <= 1`` and ``0 <= effective_distance <= 2``, and that
the closet-boosted source still ranks first.

Relationship to other PRs:

* **#988** clamps the output ``similarity`` alone. That does not fix
  the sort-key inversion or the invalid ``effective_distance`` in the
  returned dict. This PR clamps at the arithmetic source so both
  downstream users of the value stay in range.
* Orthogonal to **#979** (``tool_check_duplicate`` negative similarity).
---
 mempalace/searcher.py  |  7 +++++-
 tests/test_searcher.py | 57 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index d615623..e99fc5d 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -825,7 +825,12 @@ def search_memories(
                 matched_via = "drawer+closet"
                 closet_preview = c_preview
 
-        effective_dist = dist - boost
+        # Clamp to the valid cosine-distance range [0, 2]. When a strong
+        # closet boost (up to 0.40) exceeds the raw distance, the subtraction
+        # can go negative — which (a) yields ``similarity > 1.0`` downstream
+        # and (b) makes the sort key land *below* ordinary positive distances,
+        # inverting the ranking so the best hybrid matches sort last.
+        effective_dist = max(0.0, min(2.0, dist - boost))
         entry = {
             "text": doc,
             "wing": meta.get("wing", "unknown"),
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 6b85832..ad60641 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -120,6 +120,63 @@ def mock_get_collection(path, create=False):
         assert none_hit["wing"] == "unknown"
         assert none_hit["room"] == "unknown"
 
+    def test_effective_distance_clamped_to_valid_cosine_range(self):
+        """A strong closet boost (up to 0.40) applied to a low-distance drawer
+        can drive ``dist - boost`` negative. That violates the cosine-distance
+        invariant ``[0, 2]``: the API returns ``similarity > 1.0`` and the
+        internal ``_sort_key`` sinks below ordinary positive distances,
+        inverting the ranking so the best hybrid matches sort last.
+
+        With the clamp, ``effective_distance`` stays in ``[0, 2]``,
+        ``similarity`` stays in ``[0, 1]``, and the sort order is stable.
+        """
+        # Drawer a.md gets a tiny base distance (0.08) — nearly exact match.
+        # Drawer b.md gets a larger base distance (0.35).
+        drawers_col = MagicMock()
+        drawers_col.query.return_value = {
+            "documents": [["doc-a", "doc-b"]],
+            "metadatas": [[
+                {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
+                {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
+            ]],
+            "distances": [[0.08, 0.35]],
+            "ids": [["d-a", "d-b"]],
+        }
+        # A strong closet at rank 0 points at a.md → boost = 0.40,
+        # which exceeds a.md's base distance and would go negative without
+        # the clamp. No closet for b.md.
+        closets_col = MagicMock()
+        closets_col.query.return_value = {
+            "documents": [["closet-preview-a"]],
+            "metadatas": [[{"source_file": "a.md"}]],
+            "distances": [[0.2]],  # within CLOSET_DISTANCE_CAP (1.5)
+            "ids": [["c-a"]],
+        }
+
+        with (
+            patch("mempalace.searcher.get_collection", return_value=drawers_col),
+            patch("mempalace.searcher.get_closets_collection", return_value=closets_col),
+        ):
+            result = search_memories("query", "/fake/path", n_results=5)
+
+        hits = result["results"]
+        assert hits, "should return results"
+
+        # Invariants on every hit.
+        for h in hits:
+            assert 0.0 <= h["similarity"] <= 1.0, (
+                f"similarity out of range: {h['similarity']} for {h['source_file']}"
+            )
+            assert 0.0 <= h["effective_distance"] <= 2.0, (
+                f"effective_distance out of range: {h['effective_distance']} "
+                f"for {h['source_file']}"
+            )
+
+        # With the clamp, the closet-boosted a.md still ranks ahead of b.md —
+        # the boost still wins, but it no longer flips the ranking.
+        assert hits[0]["source_file"] == "a.md"
+        assert hits[0]["matched_via"] == "drawer+closet"
+
 
 # ── BM25 internals: None / empty document safety ─────────────────────
 

From aac8437979a562ab5b0c14541a22da4996e03250 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:53:59 -0300
Subject: [PATCH 062/127] style: ruff format tests/test_searcher.py (CI lint)

---
 tests/test_searcher.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index ad60641..4f0b4c0 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -135,10 +135,12 @@ def test_effective_distance_clamped_to_valid_cosine_range(self):
         drawers_col = MagicMock()
         drawers_col.query.return_value = {
             "documents": [["doc-a", "doc-b"]],
-            "metadatas": [[
-                {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
-                {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
-            ]],
+            "metadatas": [
+                [
+                    {"source_file": "a.md", "wing": "w", "room": "r", "chunk_index": 0},
+                    {"source_file": "b.md", "wing": "w", "room": "r", "chunk_index": 0},
+                ]
+            ],
             "distances": [[0.08, 0.35]],
             "ids": [["d-a", "d-b"]],
         }
@@ -164,9 +166,9 @@ def test_effective_distance_clamped_to_valid_cosine_range(self):
 
         # Invariants on every hit.
         for h in hits:
-            assert 0.0 <= h["similarity"] <= 1.0, (
-                f"similarity out of range: {h['similarity']} for {h['source_file']}"
-            )
+            assert (
+                0.0 <= h["similarity"] <= 1.0
+            ), f"similarity out of range: {h['similarity']} for {h['source_file']}"
             assert 0.0 <= h["effective_distance"] <= 2.0, (
                 f"effective_distance out of range: {h['effective_distance']} "
                 f"for {h['source_file']}"

From 0fdb480e12caaf894838886ca6db2ad68e3d67b6 Mon Sep 17 00:00:00 2001
From: Oleksii Pylypchuk <alpi@keemail.me>
Date: Sat, 18 Apr 2026 01:55:43 +0300
Subject: [PATCH 063/127] fix(mcp): handle null JSON-RPC request payloads
 safely

When the MCP client sends a malformed or null top-level request, prevent the AttributeError on request.get() by explicitly validating that the request is a dictionary. Returns standard JSON-RPC Error -32600 (Invalid Request) instead of crashing the server.
---
 mempalace/mcp_server.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index ca71f60..71a96df 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1968,6 +1968,8 @@ def tool_reconnect():
 
 
 def handle_request(request):
+    if not isinstance(request, dict):
+        return {"jsonrpc": "2.0", "error": {"code": -32600, "message": "Invalid Request"}}
     method = request.get("method") or ""
     params = request.get("params") or {}
     req_id = request.get("id")

From 55d79dc8cd03d408644b05cb44aa277edaf2eb86 Mon Sep 17 00:00:00 2001
From: Oleksii Pylypchuk <alpi@keemail.me>
Date: Sat, 18 Apr 2026 16:53:53 +0300
Subject: [PATCH 064/127] fix: include null id in JSON-RPC invalid request
 error responses and add validation tests

---
 mempalace/mcp_server.py  | 2 +-
 tests/test_mcp_server.py | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 71a96df..227e91a 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1969,7 +1969,7 @@ def tool_reconnect():
 
 def handle_request(request):
     if not isinstance(request, dict):
-        return {"jsonrpc": "2.0", "error": {"code": -32600, "message": "Invalid Request"}}
+        return {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
     method = request.get("method") or ""
     params = request.get("params") or {}
     req_id = request.get("id")
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index b036afd..0a020d3 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -190,6 +190,13 @@ def test_malformed_method_none(self):
         resp = handle_request({"method": None, "id": 99, "params": {}})
         assert resp["error"]["code"] == -32601
 
+    @pytest.mark.parametrize("payload", [None, [], "plain", 42, True])
+    def test_handle_request_invalid_payload_returns_jsonrpc_error(self, payload):
+        from mempalace.mcp_server import handle_request
+
+        resp = handle_request(payload)
+        assert resp == {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
+
     def test_tools_call_dispatches(self, monkeypatch, config, palace_path, seeded_kg):
         _patch_mcp_server(monkeypatch, config, seeded_kg)
         from mempalace.mcp_server import handle_request

From a85d432b544060795ff57796d713c597122f6293 Mon Sep 17 00:00:00 2001
From: Oleksii Pylypchuk <alpi@keemail.me>
Date: Sat, 18 Apr 2026 22:05:34 +0300
Subject: [PATCH 065/127] feat: add validation for missing name parameter in
 tools/call requests

---
 mempalace/mcp_server.py  |  6 ++++++
 tests/test_mcp_server.py | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 227e91a..25439f2 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -2007,6 +2007,12 @@ def handle_request(request):
             },
         }
     elif method == "tools/call":
+        if not isinstance(params, dict) or "name" not in params:
+            return {
+                "jsonrpc": "2.0",
+                "id": req_id,
+                "error": {"code": -32602, "message": "Invalid params: 'name' is required for tools/call"},
+            }
         tool_name = params.get("name")
         tool_args = params.get("arguments") or {}
         if tool_name not in TOOLS:
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 0a020d3..fc56b34 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -148,6 +148,20 @@ def test_unknown_tool(self):
         )
         assert resp["error"]["code"] == -32601
 
+    def test_tools_call_missing_params(self):
+        from mempalace.mcp_server import handle_request
+
+        for bad_params in [None, {}, {"arguments": {}}]:
+            resp = handle_request(
+                {
+                    "method": "tools/call",
+                    "id": 15,
+                    "params": bad_params,
+                }
+            )
+            assert resp["error"]["code"] == -32602
+            assert "Invalid params" in resp["error"]["message"]
+
     def test_unknown_method(self):
         from mempalace.mcp_server import handle_request
 

From 869ab3809570fa578e9dd634ec31b0b1817dddc7 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:54:11 -0300
Subject: [PATCH 066/127] style: ruff format mcp_server.py + test_mcp_server.py
 (CI lint)

---
 mempalace/mcp_server.py  | 11 +++++++++--
 tests/test_mcp_server.py |  6 +++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 25439f2..30a0bea 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1969,7 +1969,11 @@ def tool_reconnect():
 
 def handle_request(request):
     if not isinstance(request, dict):
-        return {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
+        return {
+            "jsonrpc": "2.0",
+            "id": None,
+            "error": {"code": -32600, "message": "Invalid Request"},
+        }
     method = request.get("method") or ""
     params = request.get("params") or {}
     req_id = request.get("id")
@@ -2011,7 +2015,10 @@ def handle_request(request):
             return {
                 "jsonrpc": "2.0",
                 "id": req_id,
-                "error": {"code": -32602, "message": "Invalid params: 'name' is required for tools/call"},
+                "error": {
+                    "code": -32602,
+                    "message": "Invalid params: 'name' is required for tools/call",
+                },
             }
         tool_name = params.get("name")
         tool_args = params.get("arguments") or {}
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index fc56b34..c073830 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -209,7 +209,11 @@ def test_handle_request_invalid_payload_returns_jsonrpc_error(self, payload):
         from mempalace.mcp_server import handle_request
 
         resp = handle_request(payload)
-        assert resp == {"jsonrpc": "2.0", "id": None, "error": {"code": -32600, "message": "Invalid Request"}}
+        assert resp == {
+            "jsonrpc": "2.0",
+            "id": None,
+            "error": {"code": -32600, "message": "Invalid Request"},
+        }
 
     def test_tools_call_dispatches(self, monkeypatch, config, palace_path, seeded_kg):
         _patch_mcp_server(monkeypatch, config, seeded_kg)

From 7b49478ef7049d124ae6f35e55da2f77cadfddf6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E7=A5=96=E9=91=AB=28940219=29?= <940219@nd.com.cn>
Date: Fri, 1 May 2026 20:46:36 +0800
Subject: [PATCH 067/127] fix: MCP server JSON output ensure_ascii=False for
 non-ASCII support

Without ensure_ascii=False, non-ASCII characters (e.g. Chinese) in tool
results and JSON-RPC responses are escaped as \uXXXX, which causes
downstream MCP clients to receive escaped text instead of the original
characters. This affects all platforms, not just Windows.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 mempalace/mcp_server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index ca71f60..f0973d3 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -2053,7 +2053,7 @@ def handle_request(request):
             return {
                 "jsonrpc": "2.0",
                 "id": req_id,
-                "result": {"content": [{"type": "text", "text": json.dumps(result, indent=2)}]},
+                "result": {"content": [{"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}]},
             }
         except Exception:
             logger.exception(f"Tool error in {tool_name}")
@@ -2114,7 +2114,7 @@ def main():
             request = json.loads(line)
             response = handle_request(request)
             if response is not None:
-                sys.stdout.write(json.dumps(response) + "\n")
+                sys.stdout.write(json.dumps(response, ensure_ascii=False) + "\n")
                 sys.stdout.flush()
         except KeyboardInterrupt:
             break

From 74288f1cdd92f6717186865caa1412d43a13fbc7 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 01:54:24 -0300
Subject: [PATCH 068/127] style: ruff format mcp_server.py (CI lint)

---
 mempalace/mcp_server.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index f0973d3..fa69aa5 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -2053,7 +2053,11 @@ def handle_request(request):
             return {
                 "jsonrpc": "2.0",
                 "id": req_id,
-                "result": {"content": [{"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}]},
+                "result": {
+                    "content": [
+                        {"type": "text", "text": json.dumps(result, indent=2, ensure_ascii=False)}
+                    ]
+                },
             }
         except Exception:
             logger.exception(f"Tool error in {tool_name}")

From eef053d75093645faae6348c08b17d1f6d733a5e Mon Sep 17 00:00:00 2001
From: bobo-xxx <111567133+bobo-xxx@users.noreply.github.com>
Date: Sat, 18 Apr 2026 12:40:58 +0800
Subject: [PATCH 069/127] fix(mcp_server): clamp similarity to [0,1] to avoid
 negative values

---
 mempalace/mcp_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index ca71f60..dbe4497 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -740,7 +740,7 @@ def tool_check_duplicate(content: str, threshold: float = 0.9):
         if results["ids"] and results["ids"][0]:
             for i, drawer_id in enumerate(results["ids"][0]):
                 dist = results["distances"][0][i]
-                similarity = round(1 - dist, 3)
+                similarity = round(max(0.0, 1 - dist), 3)
                 if similarity >= threshold:
                     # Chroma 1.5.x can return None for partially-flushed rows;
                     # coerce to empty sentinels so downstream .get() is safe.

From f2bed9284fccaec51dd78266f9113db41a9e966f Mon Sep 17 00:00:00 2001
From: bobo-xxx <111567133+bobo-xxx@users.noreply.github.com>
Date: Sat, 18 Apr 2026 12:41:12 +0800
Subject: [PATCH 070/127] fix(layers): clamp similarity to [0,1] to avoid
 negative values

---
 mempalace/layers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mempalace/layers.py b/mempalace/layers.py
index b20c656..d549afe 100644
--- a/mempalace/layers.py
+++ b/mempalace/layers.py
@@ -287,7 +287,7 @@ def search(self, query: str, wing: str = None, room: str = None, n_results: int
         for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists), 1):
             meta = meta or {}
             doc = doc or ""
-            similarity = round(1 - dist, 3)
+            similarity = round(max(0.0, 1 - dist), 3)
             wing_name = meta.get("wing", "?")
             room_name = meta.get("room", "?")
             source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""

From b68485dfd4673848bcd44064a0c8764d89f6b48a Mon Sep 17 00:00:00 2001
From: Anthony Clendenen <anthony.clendenen@gmail.com>
Date: Thu, 23 Apr 2026 13:33:28 -0700
Subject: [PATCH 071/127] fix(closet_llm): reject non-http(s) endpoints

LLMConfig accepted any URL scheme from LLM_ENDPOINT / --endpoint,
so a misconfigured endpoint such as file:///etc/passwd would be
passed straight to urllib.request.urlopen. Validate the scheme at
construction time and raise ValueError on anything other than
http/https, preserving the "privacy by architecture" guarantee.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/closet_llm.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mempalace/closet_llm.py b/mempalace/closet_llm.py
index 6274f79..50000c8 100644
--- a/mempalace/closet_llm.py
+++ b/mempalace/closet_llm.py
@@ -40,6 +40,7 @@
 import os
 import re
 import time
+import urllib.parse
 import urllib.request
 import urllib.error
 from datetime import datetime
@@ -101,6 +102,14 @@ def __init__(
         self.endpoint = (endpoint or os.environ.get("LLM_ENDPOINT", "")).rstrip("/")
         self.key = key or os.environ.get("LLM_KEY", "")
         self.model = model or os.environ.get("LLM_MODEL", "")
+        if self.endpoint:
+            # Privacy-by-architecture: reject file:// and other non-HTTP schemes
+            # so a misconfigured endpoint cannot exfiltrate local files.
+            scheme = urllib.parse.urlparse(self.endpoint).scheme.lower()
+            if scheme not in ("http", "https"):
+                raise ValueError(
+                    f"LLM_ENDPOINT must use http:// or https:// (got scheme {scheme!r})"
+                )
 
     def missing(self) -> list:
         missing = []

From ca5899e361a1bc8823145f6d1efad22f22639409 Mon Sep 17 00:00:00 2001
From: Anthony Clendenen <anthony.clendenen@gmail.com>
Date: Thu, 23 Apr 2026 13:33:38 -0700
Subject: [PATCH 072/127] refactor: fix ruff bugbear and silent-except findings

- B904: chain OSError/collection errors with "raise ... from e" in
  normalize.py and searcher.py so the original traceback is preserved.
- B007: rename unused loop variables to _name in dedup, dialect, layers,
  and room_detector_local.
- S110/S112: replace bare "try/except/pass" and "try/except/continue"
  with logger.debug(..., exc_info=True) in mcp_server, searcher,
  palace, palace_graph, miner, convo_miner, and fact_checker so
  background failures are observable without changing behaviour.

A module-level logger ("mempalace_mcp", matching mcp_server/searcher)
is added to the five files that didn't already have one. Configured
ruff checks (E/F/W/C901) and ruff --select B, S110, S112 all pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/convo_miner.py         |  5 ++++-
 mempalace/dedup.py               |  2 +-
 mempalace/dialect.py             |  2 +-
 mempalace/fact_checker.py        |  4 ++++
 mempalace/layers.py              |  2 +-
 mempalace/mcp_server.py          |  4 ++--
 mempalace/miner.py               |  5 ++++-
 mempalace/normalize.py           |  4 ++--
 mempalace/palace.py              |  7 +++++--
 mempalace/palace_graph.py        |  2 +-
 mempalace/room_detector_local.py |  2 +-
 mempalace/searcher.py            | 10 ++++++----
 12 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/mempalace/convo_miner.py b/mempalace/convo_miner.py
index 2cf57e4..915b4d1 100644
--- a/mempalace/convo_miner.py
+++ b/mempalace/convo_miner.py
@@ -11,6 +11,7 @@
 import os
 import sys
 import hashlib
+import logging
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
@@ -24,6 +25,8 @@
     mine_lock,
 )
 
+logger = logging.getLogger("mempalace_mcp")
+
 
 # Cached hall keywords — avoids re-reading config per drawer
 _HALL_KEYWORDS_CACHE = None
@@ -331,7 +334,7 @@ def _file_chunks_locked(collection, source_file, chunks, wing, room, agent, extr
         try:
             collection.delete(where={"source_file": source_file})
         except Exception:
-            pass
+            logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
 
         # Batch chunks into bounded upserts so large transcripts keep most of
         # the embedding speedup without one huge Chroma/SQLite request. Keep
diff --git a/mempalace/dedup.py b/mempalace/dedup.py
index 6b1bac1..5e57aff 100644
--- a/mempalace/dedup.py
+++ b/mempalace/dedup.py
@@ -89,7 +89,7 @@ def dedup_source_group(col, drawer_ids, threshold=DEFAULT_THRESHOLD, dry_run=Tru
     kept = []
     to_delete = []
 
-    for did, doc, meta in items:
+    for did, doc, _meta in items:
         if not doc or len(doc) < 20:
             to_delete.append(did)
             continue
diff --git a/mempalace/dialect.py b/mempalace/dialect.py
index b72c52c..e6e214c 100644
--- a/mempalace/dialect.py
+++ b/mempalace/dialect.py
@@ -873,7 +873,7 @@ def generate_layer1(
 
         for date_key in sorted(by_date.keys()):
             lines.append(f"=MOMENTS[{date_key}]=")
-            for z, fnum in by_date[date_key]:
+            for z, _fnum in by_date[date_key]:
                 entities = []
                 for p in z.get("people", []):
                     code = self.encode_entity(p)
diff --git a/mempalace/fact_checker.py b/mempalace/fact_checker.py
index 403d913..8f1c3ba 100644
--- a/mempalace/fact_checker.py
+++ b/mempalace/fact_checker.py
@@ -27,6 +27,7 @@
 
 from __future__ import annotations
 
+import logging
 import os
 import re
 from datetime import datetime, timezone
@@ -35,6 +36,8 @@
 # ~/.mempalace/known_entities.json on every check_text call.
 from .miner import _load_known_entities_raw
 
+logger = logging.getLogger("mempalace_mcp")
+
 
 # Narrow detection patterns — parse "X is Y's Z" and "X's Z is Y".
 # Names are captured greedily as word sequences (letters + optional
@@ -214,6 +217,7 @@ def _check_kg_contradictions(text: str, palace_path: str) -> list:
         try:
             facts = kg.query_entity(subject, direction="outgoing")
         except Exception:
+            logger.debug("KG lookup failed for subject %r", subject, exc_info=True)
             continue
         if not facts:
             continue
diff --git a/mempalace/layers.py b/mempalace/layers.py
index d549afe..b92890a 100644
--- a/mempalace/layers.py
+++ b/mempalace/layers.py
@@ -157,7 +157,7 @@ def generate(self) -> str:
             lines.append(room_line)
             total_len += len(room_line)
 
-            for imp, meta, doc in entries:
+            for _imp, meta, doc in entries:
                 source = Path(meta.get("source_file", "")).name if meta.get("source_file") else ""
 
                 # Truncate doc to keep L1 compact
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 46982bb..58f9ba9 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -900,7 +900,7 @@ def tool_add_drawer(
         if existing and existing["ids"]:
             return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
     except Exception:
-        pass
+        logger.debug("Idempotency pre-check failed for %s", drawer_id, exc_info=True)
 
     try:
         col.upsert(
@@ -1418,7 +1418,7 @@ def tool_hook_settings(silent_save: bool = None, desktop_toast: bool = None):
     try:
         config = MempalaceConfig()
     except Exception:
-        pass
+        logger.debug("Could not re-read config after update", exc_info=True)
 
     result = {
         "success": True,
diff --git a/mempalace/miner.py b/mempalace/miner.py
index ba0c630..88734c9 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -12,6 +12,7 @@
 import shlex
 import hashlib
 import fnmatch
+import logging
 from pathlib import Path
 from datetime import datetime
 from collections import defaultdict
@@ -31,6 +32,8 @@
     upsert_closet_lines,
 )
 
+logger = logging.getLogger("mempalace_mcp")
+
 READABLE_EXTENSIONS = {
     ".txt",
     ".md",
@@ -842,7 +845,7 @@ def process_file(
         try:
             collection.delete(where={"source_file": source_file})
         except Exception:
-            pass
+            logger.debug("Stale-drawer purge failed for %s", source_file, exc_info=True)
 
         # Batch chunks into bounded upserts so the embedding model sees many
         # chunks per forward pass without building one huge Chroma/SQLite
diff --git a/mempalace/normalize.py b/mempalace/normalize.py
index 4252afa..ca62cca 100644
--- a/mempalace/normalize.py
+++ b/mempalace/normalize.py
@@ -118,14 +118,14 @@ def normalize(filepath: str) -> str:
     try:
         file_size = os.path.getsize(filepath)
     except OSError as e:
-        raise IOError(f"Could not read {filepath}: {e}")
+        raise IOError(f"Could not read {filepath}: {e}") from e
     if file_size > 500 * 1024 * 1024:  # 500 MB safety limit
         raise IOError(f"File too large ({file_size // (1024 * 1024)} MB): {filepath}")
     try:
         with open(filepath, "r", encoding="utf-8", errors="replace") as f:
             content = f.read()
     except OSError as e:
-        raise IOError(f"Could not read {filepath}: {e}")
+        raise IOError(f"Could not read {filepath}: {e}") from e
 
     if not content.strip():
         return content
diff --git a/mempalace/palace.py b/mempalace/palace.py
index 97f67ff..e5f6411 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -6,12 +6,15 @@
 
 import contextlib
 import hashlib
+import logging
 import os
 import re
 import threading
 
 from .backends.chroma import ChromaBackend
 
+logger = logging.getLogger("mempalace_mcp")
+
 SKIP_DIRS = {
     ".git",
     "node_modules",
@@ -229,7 +232,7 @@ def purge_file_closets(closets_col, source_file: str) -> None:
     try:
         closets_col.delete(where={"source_file": source_file})
     except Exception:
-        pass
+        logger.debug("Closet purge failed for %s", source_file, exc_info=True)
 
 
 def upsert_closet_lines(closets_col, closet_id_base, lines, metadata):
@@ -307,7 +310,7 @@ def mine_lock(source_file: str):
 
                 fcntl.flock(lf, fcntl.LOCK_UN)
         except Exception:
-            pass
+            logger.debug("Mine-lock release failed", exc_info=True)
         lf.close()
 
 
diff --git a/mempalace/palace_graph.py b/mempalace/palace_graph.py
index 3296cd5..0fff763 100644
--- a/mempalace/palace_graph.py
+++ b/mempalace/palace_graph.py
@@ -575,7 +575,7 @@ def follow_tunnels(wing: str, room: str, col=None, config=None):
                     if did and did in drawer_map:
                         c["drawer_preview"] = drawer_map[did][:300]
             except Exception:
-                pass
+                logger.debug("Drawer preview hydration failed", exc_info=True)
 
     return connections
 
diff --git a/mempalace/room_detector_local.py b/mempalace/room_detector_local.py
index 31d5b05..8e3fc20 100644
--- a/mempalace/room_detector_local.py
+++ b/mempalace/room_detector_local.py
@@ -202,7 +202,7 @@ def detect_rooms_from_files(project_dir: str) -> list:
 
     SKIP_DIRS = {".git", "node_modules", "__pycache__", ".venv", "venv", "dist", "build"}
 
-    for root, dirs, filenames in os.walk(project_path):
+    for _root, dirs, filenames in os.walk(project_path):
         dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
         for filename in filenames:
             name_lower = filename.lower().replace("-", "_").replace(" ", "_")
diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index ddddc46..536610e 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -245,7 +245,7 @@ def _expand_with_neighbors(drawers_col, matched_doc: str, matched_meta: dict, ra
         all_meta = drawers_col.get(where={"source_file": src}, include=["metadatas"])
         total_drawers = len(all_meta.ids) if all_meta.ids else None
     except Exception:
-        pass
+        logger.debug("total_drawers lookup failed for %s", src, exc_info=True)
 
     return {
         "text": combined_text,
@@ -297,10 +297,10 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
     """
     try:
         col = get_collection(palace_path, create=False)
-    except Exception:
+    except Exception as e:
         print(f"\n  No palace found at {palace_path}")
         print("  Run: mempalace init <dir> then mempalace mine <dir>")
-        raise SearchError(f"No palace found at {palace_path}")
+        raise SearchError(f"No palace found at {palace_path}") from e
 
     # Alert the user if this palace predates hnsw:space=cosine being set on
     # creation — their similarity scores will be junk until they run repair.
@@ -795,7 +795,8 @@ def search_memories(
             if source and source not in closet_boost_by_source:
                 closet_boost_by_source[source] = (rank, cdist, cdoc[:200])
     except Exception:
-        pass  # no closets yet — hybrid degrades to pure drawer search
+        # No closets yet — hybrid degrades to pure drawer search.
+        logger.debug("Closet collection unavailable; using drawer-only search", exc_info=True)
 
     # Rank-based boost. The ordinal signal ("which closet matched best") is
     # more reliable than absolute distance on narrative content, where
@@ -877,6 +878,7 @@ def search_memories(
                 include=["documents", "metadatas"],
             )
         except Exception:
+            logger.debug("Neighbor fetch failed for %s", full_source, exc_info=True)
             continue
         docs = source_drawers.documents
         metas_ = source_drawers.metadatas

From a7c4ed24d7fd222cafb25af915fa3cef87e92b27 Mon Sep 17 00:00:00 2001
From: Brian potter <brian@potterdigital.com>
Date: Sat, 2 May 2026 00:25:46 -0500
Subject: [PATCH 073/127] fix(repair): add --mode from-sqlite to recover
 palaces with corrupt HNSW (#1308)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both `--mode legacy` and the inline `cli.cmd_repair` rebuild path
call `Collection.count()` as their first read — the same call that
raises `chromadb.errors.InternalError: Failed to apply logs to the
hnsw segment writer` on the corruption class reported in #1308.
Repair would print "Cannot recover — palace may need to be re-mined
from source files" even though the underlying SQLite tables were
fully intact.

The new `--mode from-sqlite` reads `(id, document, metadata)` rows
directly from `chroma.sqlite3` via `segments` → `embeddings` →
`embedding_metadata` joins, never opens a chromadb client against
the corrupt palace, and re-upserts everything into a fresh palace.

  - `--source PATH` extracts from a corrupt palace already moved aside
  - `--archive-existing` handles the in-place case by renaming the
    existing palace to `<palace>.pre-rebuild-<timestamp>` first
  - Partial-rebuild failures raise `RebuildPartialError` with the
    archive path so users can recover; CLI exits non-zero
  - In-place mode calls `SharedSystemClient.clear_system_cache()` to
    drop chromadb's process-wide System registry (cross-palace use
    does not, to limit blast radius for library callers)
  - Source validation runs before any destructive moves

Verified end-to-end recovering a 52,300-row real-world corrupt
palace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md         |   1 +
 mempalace/cli.py     |  68 +++++++-
 mempalace/repair.py  | 358 ++++++++++++++++++++++++++++++++++++++++++-
 tests/test_repair.py | 291 +++++++++++++++++++++++++++++++++++
 4 files changed, 714 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3011aab..1972c03 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 - **CLI and `fact_checker --stdin` mojibaked non-ASCII content on Windows.** Python defaults `sys.stdin`/`stdout`/`stderr` to the system ANSI codepage (cp1252/cp1251/cp950), so `mempalace search > out.txt` and piped fact_checker invocations corrupted Cyrillic / CJK drawer text at the process boundary. New `mempalace/_stdio.py` helper reconfigures all three streams to UTF-8 on `sys.platform == "win32"`, with per-stream `errors` policy: `surrogateescape` on stdin (preserves bad bytes from redirected files for the consumer's parser), `replace` on stdout/stderr (substitutes U+FFFD instead of `UnicodeEncodeError`-ing mid-print). With this, all three user-facing console_scripts (`mcp_server`, `hooks_cli`, `cli`/`fact_checker`) now reconfigure identically on Windows. (#1282)
 - **MCP knowledge-graph tools forwarded malformed date strings to SQLite.** `tool_kg_query` (`as_of`), `tool_kg_add` (`valid_from`), and `tool_kg_invalidate` (`ended`) accepted any string and produced empty result sets on natural-language inputs like `"March 2026"` or `"yesterday"` — callers (especially LLM agents) could not distinguish "no fact at this time" from "your date format was unrecognized." New `sanitize_iso_date()` validator in `config.py` accepts `YYYY`, `YYYY-MM`, `YYYY-MM-DD` (and passes through `None`/`""`); all three tools call it before values reach the storage layer. **Behavior change:** previously-silent date typos now raise a clear `ValueError` naming the offending field; full ISO-8601 with time (`YYYY-MM-DDTHH:MM:SS`, timezone offsets) is not yet accepted — file an issue if you have a use case. (#1164, #1167)
 - **MCP server's `_kg` was a module-level singleton.** Multi-tenant hosts that rotate `MEMPALACE_PALACE_PATH` between tool calls hit the wrong sqlite file, because the KG was constructed once at import time while the ChromaDB side was already per-call via `_get_client()`. The KG is now resolved per-call through a lazy per-path cache (`_kg_by_path` keyed by `os.path.abspath`, with a double-checked-locking init under `_kg_cache_lock`). `tool_reconnect` drains and `close()`s cached KGs alongside the existing chroma reconnect. A `_call_kg` retry guard catches `sqlite3.ProgrammingError` once after a reconnect race. (#1136, #1160)
+- **`mempalace repair` can now recover palaces whose HNSW segment writer is stuck on `apply_logs`.** Both the existing `--mode legacy` rebuild and the inline `cli.cmd_repair` path call `Collection.count()` as their first read — exactly the call that raises `chromadb.errors.InternalError: Failed to apply logs to the hnsw segment writer` on the corruption class introduced upstream and reported in #1308. Repair would print `Cannot recover — palace may need to be re-mined from source files` even though the underlying SQLite tables were fully intact (the corruption lives in the on-disk index files, not the data layer). New `--mode from-sqlite` reads `(id, document, metadata)` rows directly from `chroma.sqlite3` via a `segments` → `embeddings` → `embedding_metadata` join, never opens a chromadb client against the corrupt palace, and re-upserts everything into a fresh palace at `--palace`. `--source PATH` extracts from a corrupt palace already moved aside; `--archive-existing` handles the in-place case by renaming the existing palace to `<palace>.pre-rebuild-<timestamp>` before reading from it. Documents are re-embedded under the user's configured embedding function (the original HNSW vectors live in the corrupt `data_level0.bin` and cannot be recovered, but the embedding model is deterministic so search results remain semantically equivalent). Verified end-to-end on a 52,300-row real-world corrupt palace. (#1308)
 
 ---
 
diff --git a/mempalace/cli.py b/mempalace/cli.py
index 0ab3d0f..468b765 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -673,6 +673,48 @@ def cmd_repair(args):
         )
         return
 
+    if getattr(args, "mode", "legacy") == "from-sqlite":
+        from .migrate import confirm_destructive_action
+        from .repair import RebuildPartialError, rebuild_from_sqlite
+
+        source_path = getattr(args, "source", None)
+        source_path = (
+            os.path.abspath(os.path.expanduser(source_path)) if source_path else palace_path
+        )
+        archive_existing = getattr(args, "archive_existing", False)
+
+        # Gate any path that touches the user's existing palace dir
+        # behind confirm_destructive_action. The legacy mode already
+        # gates; from-sqlite needs the same protection because:
+        # (a) --archive-existing renames the existing palace,
+        # (b) --source PATH writes into --palace dir which the user
+        #     may not realize is also a palace.
+        # No prompt when source != dest AND dest does not exist (pure
+        # extract-into-fresh-dir case is non-destructive to existing
+        # palaces).
+        is_destructive_to_dest = source_path == palace_path or os.path.exists(palace_path)
+        if is_destructive_to_dest and not confirm_destructive_action(
+            "Rebuild from SQLite", palace_path, assume_yes=getattr(args, "yes", False)
+        ):
+            return
+
+        try:
+            rebuild_from_sqlite(
+                source_palace=source_path,
+                dest_palace=palace_path,
+                archive_existing_dest=archive_existing,
+            )
+        except RebuildPartialError as exc:
+            # The error itself was already printed by rebuild_from_sqlite
+            # with recovery instructions; surface a non-zero exit so
+            # scripts and CI gates see the failure.
+            print(
+                "\n  Rebuild partial — see message above. "
+                f"Failed in collection: {exc.failed_collection}"
+            )
+            sys.exit(1)
+        return
+
     db_path = os.path.join(palace_path, "chroma.sqlite3")
 
     if not os.path.isdir(palace_path):
@@ -1213,11 +1255,31 @@ def main():
     )
     p_repair.add_argument(
         "--mode",
-        choices=["legacy", "max-seq-id"],
+        choices=["legacy", "max-seq-id", "from-sqlite"],
         default="legacy",
         help=(
-            "legacy: full-palace rebuild (default). "
-            "max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim."
+            "legacy: full-palace rebuild via the chromadb client (default). "
+            "max-seq-id: un-poison max_seq_id rows corrupted by the legacy 0.6.x shim. "
+            "from-sqlite: rebuild by reading rows directly from chroma.sqlite3, "
+            "bypassing the chromadb client. Use when legacy mode bails because the "
+            "chromadb client cannot open the collection."
+        ),
+    )
+    p_repair.add_argument(
+        "--source",
+        default=None,
+        help=(
+            "Source palace path for --mode from-sqlite (defaults to --palace). "
+            "Use when extracting from an archived corrupt palace into a new location."
+        ),
+    )
+    p_repair.add_argument(
+        "--archive-existing",
+        action="store_true",
+        help=(
+            "For --mode from-sqlite when --source equals --palace: rename the "
+            "existing palace to <palace>.pre-rebuild-<timestamp> before "
+            "rebuilding so the corrupt copy is preserved."
         ),
     )
     p_repair.add_argument(
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 1cd1556..7e98f0f 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -34,14 +34,21 @@
 import shutil
 import sqlite3
 import time
+from collections import defaultdict
 from datetime import datetime
-from typing import Optional
+from typing import Iterator, Optional
 
 from .backends.chroma import ChromaBackend, hnsw_capacity_status
 
 
 COLLECTION_NAME = "mempalace_drawers"
 
+# Collections rebuilt by ``rebuild_from_sqlite``. Order matters for the
+# upsert pass — drawers carry the bulk of the data, closets are the AAAK
+# index layer and reference drawer IDs by string in their documents (no
+# foreign-key validation, so ordering is informational, not load-bearing).
+RECOVERABLE_COLLECTIONS = ("mempalace_drawers", "mempalace_closets")
+
 
 def _get_palace_path():
     """Resolve palace path from config."""
@@ -436,6 +443,355 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     print(f"\n{'=' * 55}\n")
 
 
+class RebuildPartialError(Exception):
+    """Raised when ``rebuild_from_sqlite`` fails partway through upserts.
+
+    Carries enough state for the user (or CLI) to recover: the
+    per-collection counts that succeeded, the collection that failed,
+    the dest path holding the partial palace, and the archive path
+    (when an in-place rebuild had moved the original aside). Re-raises
+    the underlying chromadb error as ``__cause__``.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        *,
+        partial_counts: dict[str, int],
+        failed_collection: str,
+        dest_palace: str,
+        archive_path: Optional[str],
+    ):
+        super().__init__(message)
+        self.message = message
+        self.partial_counts = partial_counts
+        self.failed_collection = failed_collection
+        self.dest_palace = dest_palace
+        self.archive_path = archive_path
+
+
+def _rebuild_one_collection(
+    *,
+    backend: ChromaBackend,
+    source_palace: str,
+    dest_palace: str,
+    collection_name: str,
+    batch_size: int,
+    archive_path: Optional[str],
+    counts_so_far: dict[str, int],
+) -> int:
+    """Stream rows for one collection from SQLite and upsert into a
+    freshly-created collection at ``dest_palace``. Returns rows
+    upserted. Raises :class:`RebuildPartialError` (with the underlying
+    chromadb exception as ``__cause__``) on any upsert failure so the
+    caller can stop the loop and print recovery instructions instead of
+    silently shipping a partial palace.
+    """
+    col = backend.create_collection(dest_palace, collection_name)
+
+    ids: list[str] = []
+    docs: list[str] = []
+    metas: list[dict] = []
+    upserted = 0
+
+    def _flush() -> int:
+        nonlocal upserted
+        if not ids:
+            return upserted
+        col.upsert(ids=list(ids), documents=list(docs), metadatas=list(metas))
+        upserted += len(ids)
+        print(f"    upserted {upserted}")
+        ids.clear()
+        docs.clear()
+        metas.clear()
+        return upserted
+
+    try:
+        for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
+            ids.append(emb_id)
+            docs.append(doc or "")
+            # chromadb 1.5.x rejects None entries in the metadatas list
+            # but accepts empty dicts. Mempalace drawers always carry at
+            # least wing/room, so this branch is defensive — corruption
+            # in embedding_metadata could yield an emb_id with no rows.
+            metas.append(meta if meta else {})
+            if len(ids) >= batch_size:
+                _flush()
+        _flush()
+    except Exception as exc:  # noqa: BLE001 — chromadb raises many shapes
+        partial = dict(counts_so_far)
+        partial[collection_name] = upserted
+        msg_parts = [
+            f"Upsert failed in collection {collection_name!r} after {upserted} rows: {exc!r}",
+            f"Partial palace left at: {dest_palace}",
+        ]
+        if archive_path is not None:
+            msg_parts.append(f"Original palace archived at: {archive_path}")
+            msg_parts.append(
+                "  Recover by removing the partial dest and re-running with "
+                f"--source {archive_path}"
+            )
+        else:
+            msg_parts.append("  Source palace is unchanged. Remove the partial dest and re-run.")
+        message = "\n  ".join(msg_parts)
+        print(f"\n  ERROR: {message}")
+        raise RebuildPartialError(
+            message,
+            partial_counts=partial,
+            failed_collection=collection_name,
+            dest_palace=dest_palace,
+            archive_path=archive_path,
+        ) from exc
+
+    return upserted
+
+
+def extract_via_sqlite(palace_path: str, collection_name: str) -> Iterator[tuple[str, str, dict]]:
+    """Yield ``(embedding_id, document, metadata)`` for every row in
+    ``collection_name``'s metadata segment by reading ``chroma.sqlite3``
+    directly.
+
+    Bypasses the chromadb client entirely — never opens a
+    ``PersistentClient``, never imports hnswlib, never invokes the
+    HNSW segment writer. This is the recovery path for palaces where
+    ``Collection.count()`` / ``Collection.get()`` raise ``InternalError``
+    because the compactor cannot apply WAL logs to the HNSW segment
+    (#1308). The drawer rows are still on disk in
+    ``embeddings`` + ``embedding_metadata``; the corruption lives in the
+    on-disk index files, not the SQLite tables.
+
+    Resolution rule for chromadb's typed metadata columns: each
+    ``embedding_metadata`` row stores its value in exactly one of
+    ``string_value`` / ``int_value`` / ``float_value`` / ``bool_value``;
+    we pick the first non-NULL column in that order. Rows where every
+    typed column is NULL are dropped (chromadb never writes that shape).
+    The ``chroma:document`` key is removed from the metadata dict and
+    returned as the document; this matches how chromadb itself stores
+    ``add(documents=...)``.
+
+    Silent on missing palace, missing ``chroma.sqlite3``, or unknown
+    collection name — yields nothing. Callers that need to distinguish
+    "empty collection" from "collection not present" should query
+    :func:`sqlite_drawer_count` first.
+    """
+    sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
+    if not os.path.isfile(sqlite_path):
+        return
+
+    conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
+    try:
+        seg_row = conn.execute(
+            """
+            SELECT s.id FROM segments s
+            JOIN collections c ON s.collection = c.id
+            WHERE c.name = ? AND s.scope = 'METADATA'
+            """,
+            (collection_name,),
+        ).fetchone()
+        if not seg_row:
+            return
+        segment_id = seg_row[0]
+
+        per_id: dict[str, dict] = defaultdict(dict)
+        order: list[str] = []
+        for emb_id, key, sv, iv, fv, bv in conn.execute(
+            """
+            SELECT e.embedding_id, em.key, em.string_value, em.int_value,
+                   em.float_value, em.bool_value
+            FROM embedding_metadata em
+            JOIN embeddings e ON em.id = e.id
+            WHERE e.segment_id = ?
+            ORDER BY em.id
+            """,
+            (segment_id,),
+        ):
+            if emb_id not in per_id:
+                order.append(emb_id)
+            if sv is not None:
+                per_id[emb_id][key] = sv
+            elif iv is not None:
+                per_id[emb_id][key] = iv
+            elif fv is not None:
+                per_id[emb_id][key] = fv
+            elif bv is not None:
+                per_id[emb_id][key] = bool(bv)
+
+        for emb_id in order:
+            kv = per_id[emb_id]
+            doc = kv.pop("chroma:document", "")
+            yield emb_id, doc, kv
+    finally:
+        conn.close()
+
+
+def rebuild_from_sqlite(
+    source_palace: str,
+    dest_palace: str,
+    *,
+    archive_existing_dest: bool = False,
+    batch_size: int = 1000,
+) -> dict[str, int]:
+    """Rebuild a palace by reading drawers from ``source_palace``'s
+    ``chroma.sqlite3`` and upserting them into a fresh palace at
+    ``dest_palace``.
+
+    Recovery path for the #1308 failure mode: the chromadb client raises
+    ``InternalError: Failed to apply logs to the hnsw segment writer``
+    on every operation that touches the index (``count``, ``get``,
+    ``query``), but the underlying SQLite tables are intact. Both the
+    legacy ``rebuild_index`` and the inline ``cli.cmd_repair`` path call
+    ``Collection.count()`` as their first read — exactly the call that
+    fails — so neither can recover this class of corruption. This
+    function bypasses the chromadb read path entirely via
+    :func:`extract_via_sqlite`.
+
+    Re-embeds documents at upsert time using the configured embedding
+    function; the original HNSW vectors are not preserved (they live in
+    the corrupt ``data_level0.bin`` / ``link_lists.bin``, not in
+    SQLite). Acceptable for a corruption-recovery flow because the
+    embedding model is deterministic — same model + same document text
+    yields semantically equivalent search results.
+
+    ``archive_existing_dest`` controls behavior when ``dest_palace``
+    already exists:
+
+    * ``False`` (default) — refuse with a clear message. Callers must
+      manually move the existing palace aside first.
+    * ``True`` — rename ``dest_palace`` to
+      ``<dest_palace>.pre-rebuild-<timestamp>`` and read from there
+      instead. Used by the in-place CLI flow where ``--source`` defaults
+      to the same path as ``--palace``.
+
+    Returns a ``{collection_name: row_count}`` dict so callers (CLI,
+    tests) can verify the per-collection rebuild count without parsing
+    stdout. Returns ``{}`` on validation failures (missing source,
+    refusing to overwrite). Raises :class:`RebuildPartialError` if a
+    chromadb upsert fails partway through; the dest palace is left in
+    place so the user can inspect what landed, and the in-place archive
+    (when applicable) is reported in the error so the user can re-run
+    against it.
+
+    .. warning::
+
+       In-place mode (``source_palace == dest_palace`` with
+       ``archive_existing_dest=True``) calls
+       ``chromadb.api.client.SharedSystemClient.clear_system_cache()`` to
+       drop chromadb's process-wide System registry — required because
+       an existing cached System built against the original palace will
+       refuse ``create_collection`` after the dir is renamed (chromadb
+       still thinks the collections exist). This invalidates any
+       PersistentClient instances held elsewhere in the same process for
+       *any* palace, not just this one. Do not call this function from
+       inside a long-running mempalace process (MCP server, daemon)
+       while other callers hold live ``PersistentClient`` references —
+       use the CLI in a separate process instead. Cross-palace use
+       (``source != dest``) does not touch the cache.
+
+    Note on metadata fidelity: the resolution rule
+    (``string_value`` → ``int_value`` → ``float_value`` → ``bool_value``)
+    matches the precedent in :mod:`mempalace.migrate`. ChromaDB 0.4.x
+    occasionally wrote booleans as ``int_value=0/1``; those will
+    round-trip as ``int`` rather than ``bool`` after this rebuild. This
+    is a known divergence and matches the existing migrate-path
+    behavior.
+    """
+    source_palace = os.path.abspath(os.path.expanduser(source_palace))
+    dest_palace = os.path.abspath(os.path.expanduser(dest_palace))
+
+    src_db = os.path.join(source_palace, "chroma.sqlite3")
+
+    in_place = source_palace == dest_palace
+
+    print(f"\n{'=' * 55}")
+    print("  MemPalace Repair — Rebuild from SQLite")
+    print(f"{'=' * 55}\n")
+    print(f"  Source: {source_palace}")
+    print(f"  Dest:   {dest_palace}")
+
+    # Validate source BEFORE any destructive moves. An earlier draft
+    # archived the dest first and surfaced the missing-chroma.sqlite3
+    # error after — leaving the user with a renamed dir to manually undo
+    # when the archive itself was empty. Validate first so a user error
+    # (--source pointing at a non-palace dir) bails cleanly.
+    if in_place:
+        if not archive_existing_dest:
+            print(
+                "\n  Source and dest are the same path. Pass "
+                "archive_existing_dest=True (CLI: --archive-existing) to move "
+                "the existing palace aside, or pass a different source_palace= "
+                "(CLI: --source)."
+            )
+            return {}
+        if not os.path.isfile(src_db):
+            print(f"\n  Source palace has no chroma.sqlite3 at {src_db}")
+            return {}
+    else:
+        if not os.path.isfile(src_db):
+            print(f"\n  Source palace has no chroma.sqlite3 at {src_db}")
+            return {}
+        if os.path.exists(dest_palace):
+            print(
+                f"\n  Refusing to rebuild into existing path: {dest_palace}\n"
+                "  Move it aside, pass a different dest, or set "
+                "archive_existing_dest=True if rebuilding in place "
+                "(source_palace == dest_palace)."
+            )
+            return {}
+
+    archive_path: Optional[str] = None
+    if in_place:
+        ts = datetime.now().strftime("%Y%m%d-%H%M%S")
+        archive_path = f"{dest_palace}.pre-rebuild-{ts}"
+        print(f"  Archiving {dest_palace} → {archive_path}")
+        shutil.move(dest_palace, archive_path)
+        source_palace = archive_path
+        src_db = os.path.join(source_palace, "chroma.sqlite3")
+
+        # In-place only: drop chromadb's process-wide System registry so
+        # the new client at dest_palace builds a fresh System. Without
+        # this, ``create_collection`` raises "Collection already exists"
+        # because the cached System still holds the pre-rename schema.
+        # Cross-palace mode does not need this and would needlessly
+        # invalidate other callers' clients (see docstring warning).
+        try:
+            from chromadb.api.client import SharedSystemClient
+
+            SharedSystemClient.clear_system_cache()
+        except Exception as exc:  # noqa: BLE001
+            print(
+                f"  Warning: could not clear chromadb system cache ({exc!r}); "
+                "in-place rebuild may fail with 'Collection already exists'."
+            )
+
+    os.makedirs(dest_palace, exist_ok=True)
+
+    backend = ChromaBackend()
+    counts: dict[str, int] = {}
+
+    for cname in RECOVERABLE_COLLECTIONS:
+        print(f"\n  [{cname}]")
+        upserted = _rebuild_one_collection(
+            backend=backend,
+            source_palace=source_palace,
+            dest_palace=dest_palace,
+            collection_name=cname,
+            batch_size=batch_size,
+            archive_path=archive_path,
+            counts_so_far=counts,
+        )
+        counts[cname] = upserted
+        if upserted == 0:
+            print(f"    no rows found for {cname} in source palace")
+        else:
+            print(f"    done: {upserted} rows in {cname}")
+
+    print(f"\n  Rebuild complete. {sum(counts.values())} total rows.")
+    if archive_path is not None:
+        print(f"  Original palace archived at: {archive_path}")
+    print(f"{'=' * 55}\n")
+    return counts
+
+
 def status(palace_path=None) -> dict:
     """Read-only health check: compare sqlite vs HNSW element counts.
 
diff --git a/tests/test_repair.py b/tests/test_repair.py
index bc770dd..35e6a44 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -682,3 +682,294 @@ def flaky_detect(*args, **kwargs):
     # A backup file is still present — caller can roll back from it.
     leftover = [fn for fn in os.listdir(palace) if "max-seq-id-backup-" in fn]
     assert leftover
+
+
+# ── extract_via_sqlite + rebuild_from_sqlite (#1308) ──────────────────
+#
+# These tests build real chromadb palaces in tmp_path rather than mocking
+# the SQLite layer. The bug class they guard against is "extraction sees
+# different rows than chromadb stored" — the only honest check is to let
+# chromadb actually write rows and then read them back via the SQLite
+# bypass. Mocking the SQLite cursor would defeat the test.
+
+
+def _seed_palace(palace_path, collection_name, rows):
+    """Build a real chromadb palace at ``palace_path`` and add ``rows``.
+
+    ``rows`` is a list of ``(id, document, metadata)`` tuples. Returns
+    the populated collection so callers can assert on the writer's view
+    of state before the SQLite read.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    backend = ChromaBackend()
+    col = backend.create_collection(str(palace_path), collection_name)
+    col.upsert(
+        ids=[r[0] for r in rows],
+        documents=[r[1] for r in rows],
+        metadatas=[r[2] for r in rows],
+    )
+    return col
+
+
+def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
+    """Round-trip: a chromadb palace with N upserted rows returns those
+    same N rows when read via the SQLite bypass.
+
+    Catches: anyone who breaks the segments/embeddings/embedding_metadata
+    JOIN, swaps the metadata vs vector segment, or changes how the
+    document is stored under the ``chroma:document`` key.
+    """
+    rows = [
+        (f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
+        for i in range(25)
+    ]
+    _seed_palace(tmp_path, "mempalace_drawers", rows)
+
+    extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
+
+    assert len(extracted) == 25
+    by_id = {emb_id: (doc, meta) for emb_id, doc, meta in extracted}
+    assert set(by_id) == {r[0] for r in rows}
+    for emb_id, doc, meta in rows:
+        got_doc, got_meta = by_id[emb_id]
+        assert got_doc == doc, f"document mangled for {emb_id}"
+        assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
+
+
+def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
+    """Chromadb stores int / float / bool / string in distinct typed
+    columns. Extraction must round-trip the original type, not coerce
+    everything to string.
+
+    Catches: a regression where the SELECT order changes and ints come
+    back as None, or where the column-resolution rule prefers the wrong
+    column.
+    """
+    rows = [
+        (
+            "drawer_typed",
+            "doc",
+            {
+                "wing": "w",
+                "chunk_index": 7,  # int
+                "score": 0.42,  # float
+                "is_active": True,  # bool
+            },
+        ),
+    ]
+    _seed_palace(tmp_path, "mempalace_drawers", rows)
+
+    extracted = list(repair.extract_via_sqlite(str(tmp_path), "mempalace_drawers"))
+    assert len(extracted) == 1
+    _, _, meta = extracted[0]
+
+    assert meta["chunk_index"] == 7 and isinstance(meta["chunk_index"], int)
+    assert meta["score"] == 0.42 and isinstance(meta["score"], float)
+    assert meta["is_active"] is True
+    assert meta["wing"] == "w"
+
+
+def test_extract_via_sqlite_unknown_collection_yields_nothing(tmp_path):
+    """Asking for a collection that isn't in the palace must return an
+    empty iterator, not silently fall back to another collection's
+    metadata segment. Seeds two real collections and queries for a third
+    name so a regression that drops the WHERE c.name=? filter would leak
+    rows from the seeded collections rather than passing.
+    """
+    _seed_palace(tmp_path, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    _seed_palace(tmp_path, "mempalace_closets", [("c1", "abbrev", {"wing": "w"})])
+    assert list(repair.extract_via_sqlite(str(tmp_path), "not_a_real_collection")) == []
+
+
+def test_extract_via_sqlite_missing_palace_yields_nothing(tmp_path):
+    """No chroma.sqlite3 → empty iterator, no exception. Callers depend
+    on this when probing speculatively."""
+    empty = tmp_path / "no_palace_here"
+    empty.mkdir()
+    assert list(repair.extract_via_sqlite(str(empty), "mempalace_drawers")) == []
+
+
+def test_rebuild_from_sqlite_roundtrips_via_real_chromadb(tmp_path):
+    """End-to-end: seed source palace, rebuild into a fresh dest, then
+    open dest with a fresh ChromaBackend and verify ``count()`` and
+    metadata filters return the original rows. Also asserts a closet
+    document round-trips so a future regression that re-embeds with the
+    wrong EF or swaps drawer/closet content would fail here.
+
+    This is the single most important regression guard. If
+    ``rebuild_from_sqlite`` silently drops rows or mangles metadata, no
+    other test in this file would catch it because they all stop at the
+    extraction layer.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+
+    rows = [
+        (f"drawer_{i:03d}", f"body {i}", {"wing": "alpha" if i % 2 else "beta", "room": "r0"})
+        for i in range(40)
+    ]
+    _seed_palace(source, "mempalace_drawers", rows)
+    _seed_palace(
+        source,
+        "mempalace_closets",
+        [("closet_x", "abbrev pointer →drawer_001", {"wing": "alpha"})],
+    )
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {"mempalace_drawers": 40, "mempalace_closets": 1}
+
+    backend = ChromaBackend()
+    drawers = backend.get_collection(str(dest), "mempalace_drawers")
+    assert drawers.count() == 40
+    alpha = drawers.get(where={"wing": "alpha"})
+    assert len(alpha["ids"]) == 20
+
+    # Spot-check that document text round-trips for one specific drawer
+    # — protects against a regression where extraction or upsert order
+    # silently swaps document bodies between IDs.
+    one = drawers.get(ids=["drawer_007"], include=["documents", "metadatas"])
+    assert one["documents"] == ["body 7"]
+    assert one["metadatas"][0]["wing"] == "alpha"
+
+    # Closets: the AAAK index layer. Re-embedded with the same EF so a
+    # known closet ID and its document body must come back intact.
+    closets = backend.get_collection(str(dest), "mempalace_closets")
+    assert closets.count() == 1
+    closet_row = closets.get(ids=["closet_x"], include=["documents", "metadatas"])
+    assert closet_row["documents"] == ["abbrev pointer →drawer_001"]
+    assert closet_row["metadatas"][0] == {"wing": "alpha"}
+
+
+def test_rebuild_from_sqlite_refuses_existing_dest(tmp_path):
+    """Refuse to write into a directory that already exists when source
+    and dest differ. Without this, an unattended re-run would silently
+    interleave a partial rebuild with whatever's already at dest.
+    """
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+    _seed_palace(source, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    dest.mkdir()
+    # Drop a marker file so we can prove the dir wasn't touched.
+    (dest / "marker.txt").write_text("preexisting")
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {}
+    assert (dest / "marker.txt").read_text() == "preexisting"
+    assert not (dest / "chroma.sqlite3").exists()
+
+
+def test_rebuild_from_sqlite_in_place_archives_when_opted_in(tmp_path):
+    """In-place rebuild (source == dest) with ``archive_existing_dest=True``
+    must move the original aside to ``<dest>.pre-rebuild-<ts>`` and read
+    from the archive — the original drawer rows must survive in the new
+    palace, AND the archive itself must still contain the original rows.
+
+    Catches: a refactor that moves the original out but then reads from
+    the now-empty original location, producing an empty rebuild; also
+    catches a swap that empties the archive after reading.
+    """
+    palace = tmp_path / "palace"
+    rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(15)]
+    _seed_palace(palace, "mempalace_drawers", rows)
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+    assert counts["mempalace_drawers"] == 15
+
+    archives = [p for p in tmp_path.iterdir() if p.name.startswith("palace.pre-rebuild-")]
+    assert len(archives) == 1
+    assert (archives[0] / "chroma.sqlite3").exists()
+    # Archive must still hold the same row count via the SQLite bypass —
+    # proves the archive wasn't silently truncated as a side effect.
+    archived_rows = list(repair.extract_via_sqlite(str(archives[0]), "mempalace_drawers"))
+    assert len(archived_rows) == 15
+
+    from mempalace.backends.chroma import ChromaBackend
+
+    rebuilt = ChromaBackend().get_collection(str(palace), "mempalace_drawers")
+    assert rebuilt.count() == 15
+
+
+def test_rebuild_from_sqlite_in_place_refuses_without_archive_flag(tmp_path):
+    """Source == dest without archive flag must abort untouched. The
+    most catastrophic possible regression of this code path is silently
+    deleting the only copy of the user's data."""
+    palace = tmp_path / "palace"
+    _seed_palace(palace, "mempalace_drawers", [("d1", "doc", {"wing": "w"})])
+    sqlite_before = (palace / "chroma.sqlite3").stat().st_size
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace))
+    assert counts == {}
+    # Same file, untouched.
+    assert (palace / "chroma.sqlite3").stat().st_size == sqlite_before
+    archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
+    assert archives == []
+
+
+def test_rebuild_from_sqlite_source_missing_chroma_db(tmp_path):
+    """Source dir exists but has no chroma.sqlite3 → returns empty,
+    leaves dest untouched."""
+    source = tmp_path / "source"
+    source.mkdir()
+    (source / "stray_file").write_text("not a palace")
+    dest = tmp_path / "dest"
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+    assert counts == {}
+    assert not dest.exists()
+
+
+def test_rebuild_from_sqlite_in_place_validates_source_before_archiving(tmp_path):
+    """In-place + archive_existing_dest=True with a dir that lacks
+    chroma.sqlite3 must NOT rename the dir before bailing. An earlier
+    revision archived first and validated second, leaving the user with
+    a renamed empty dir to manually undo. Catches that ordering bug.
+    """
+    palace = tmp_path / "palace"
+    palace.mkdir()
+    (palace / "marker.txt").write_text("not a real palace")
+
+    counts = repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+    assert counts == {}
+    # No archive created — original dir still in place with its marker.
+    assert palace.exists()
+    assert (palace / "marker.txt").read_text() == "not a real palace"
+    archives = [p for p in tmp_path.iterdir() if "pre-rebuild" in p.name]
+    assert archives == []
+
+
+def test_rebuild_from_sqlite_raises_on_upsert_failure(tmp_path, monkeypatch):
+    """Mid-batch upsert failure must raise ``RebuildPartialError`` and
+    surface the failed collection + archive path so the user can recover.
+    Without this, an unattended script gets exit-code-zero on a partial
+    rebuild and the user discovers the data loss only when search starts
+    returning fewer hits.
+    """
+    palace = tmp_path / "palace"
+    rows = [(f"d{i}", f"body {i}", {"wing": "w", "room": "r"}) for i in range(5)]
+    _seed_palace(palace, "mempalace_drawers", rows)
+
+    # Make the very first upsert raise so we don't depend on batch
+    # boundary behavior. Patching ChromaCollection.upsert (the wrapper
+    # mempalace's backend returns) keeps the failure path realistic.
+    # ``monkeypatch`` is pytest's built-in fixture that auto-restores
+    # the original attribute when the test exits, so we don't need to
+    # undo this manually.
+    from mempalace.backends.chroma import ChromaCollection
+
+    def boom(self, **kwargs):
+        raise RuntimeError("simulated chromadb upsert failure")
+
+    monkeypatch.setattr(ChromaCollection, "upsert", boom)
+
+    with pytest.raises(repair.RebuildPartialError) as excinfo:
+        repair.rebuild_from_sqlite(str(palace), str(palace), archive_existing_dest=True)
+
+    err = excinfo.value
+    assert err.failed_collection == "mempalace_drawers"
+    assert err.partial_counts.get("mempalace_drawers") == 0
+    assert err.archive_path is not None
+    assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
+    assert err.dest_palace == os.path.abspath(str(palace))

From cb6bfd523149f976ae10264c80a8dd4ff733b25c Mon Sep 17 00:00:00 2001
From: Brian potter <brian@potterdigital.com>
Date: Sat, 2 May 2026 12:11:52 -0500
Subject: [PATCH 074/127] chore: gitignore .envrc for direnv users

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 1619ba8..ba8ec10 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ venv/
 
 # ChromaDB local data
 *.sqlite3-journal
+.envrc

From d92c741084c7a3d276ecd31c483931ff468bdd35 Mon Sep 17 00:00:00 2001
From: Brian potter <brian@potterdigital.com>
Date: Sat, 2 May 2026 12:12:08 -0500
Subject: [PATCH 075/127] fix(repair): address PR #1310 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Five small hardening fixes for the from-sqlite rebuild path, all from
mjc's review on #1310:

- repair.py: drawers collection name now resolves from
  MempalaceConfig().collection_name via _drawers_collection_name() (closets
  stays fixed by design — AAAK index references drawer IDs by string).
  Lines up with the broader configured-collection work in #1312 so that
  PR can rebase cleanly on top.
- repair.py: create_collection() moved inside the try block in
  _rebuild_one_collection so a Chroma "Collection already exists" failure
  surfaces as RebuildPartialError with archive_path, not an unstructured
  exception that strands the user without recovery instructions.
- repair.py: rebuild_from_sqlite wraps backend lifetime in try/finally
  with backend.close() so PersistentClient handles to dest_palace are
  released on every exit path. The from-sqlite path post-dates #1285's
  lifecycle hardening of the legacy rebuild, so this needed its own
  cleanup.
- cli.py: cmd_repair (from-sqlite mode) now exits non-zero when
  rebuild_from_sqlite returns {} (validation refusal sentinel), so
  unattended scripts/CI distinguish "invalid inputs" from a successful
  rebuild that legitimately found zero rows.
- tests/test_repair.py: test_extract_via_sqlite_returns_all_rows_with_metadata
  now asserts every backing segment is scope='METADATA', locking in the
  segment-layout assumption against future regressions that point the
  JOIN at the VECTOR segment.

New test coverage:
- test_rebuild_from_sqlite_honors_configured_drawer_collection_name
- test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero
- test_cmd_repair_from_sqlite_success_does_not_exit

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 mempalace/cli.py     |  11 +++-
 mempalace/repair.py  | 121 ++++++++++++++++++++++++++++++++-----------
 tests/test_cli.py    |  61 ++++++++++++++++++++++
 tests/test_repair.py |  89 +++++++++++++++++++++++++++++++
 4 files changed, 250 insertions(+), 32 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 468b765..95915eb 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -699,7 +699,7 @@ def cmd_repair(args):
             return
 
         try:
-            rebuild_from_sqlite(
+            counts = rebuild_from_sqlite(
                 source_palace=source_path,
                 dest_palace=palace_path,
                 archive_existing_dest=archive_existing,
@@ -713,6 +713,15 @@ def cmd_repair(args):
                 f"Failed in collection: {exc.failed_collection}"
             )
             sys.exit(1)
+        # An empty counts dict is rebuild_from_sqlite's documented signal
+        # for a validation refusal (missing source, existing dest,
+        # in-place without --archive-existing). The library already
+        # printed an actionable message; exit non-zero so unattended
+        # scripts/CI distinguish "invalid inputs" from a successful
+        # rebuild that legitimately found zero rows (which still returns
+        # a populated dict with 0-valued counts).
+        if not counts:
+            sys.exit(1)
         return
 
     db_path = os.path.join(palace_path, "chroma.sqlite3")
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 7e98f0f..90fb0cf 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -43,11 +43,46 @@
 
 COLLECTION_NAME = "mempalace_drawers"
 
-# Collections rebuilt by ``rebuild_from_sqlite``. Order matters for the
-# upsert pass — drawers carry the bulk of the data, closets are the AAAK
-# index layer and reference drawer IDs by string in their documents (no
-# foreign-key validation, so ordering is informational, not load-bearing).
-RECOVERABLE_COLLECTIONS = ("mempalace_drawers", "mempalace_closets")
+# The closets collection (AAAK index layer) is intentionally fixed —
+# closets reference drawer IDs by string and live alongside drawers in the
+# same palace; renaming the closets collection per-deployment would break
+# cross-palace AAAK lookups. Drawer collection name comes from config
+# (see ``_recoverable_collections``).
+CLOSETS_COLLECTION_NAME = "mempalace_closets"
+
+
+def _drawers_collection_name() -> str:
+    """Resolve the drawers collection name from user config, falling back
+    to the module default ``COLLECTION_NAME`` if config is unreadable.
+
+    Recovery flows must honor ``MempalaceConfig().collection_name`` so a
+    user with a non-default drawer collection (e.g. multi-palace setups)
+    rebuilds the right rows. Closets remain fixed — see
+    ``CLOSETS_COLLECTION_NAME``.
+    """
+    try:
+        from .config import MempalaceConfig
+
+        return MempalaceConfig().collection_name or COLLECTION_NAME
+    except Exception:
+        return COLLECTION_NAME
+
+
+def _recoverable_collections() -> tuple[str, ...]:
+    """Collections rebuilt by ``rebuild_from_sqlite``, in upsert order.
+
+    Drawers first (bulk data), then closets (AAAK index layer that
+    references drawer IDs by string in their documents — no
+    foreign-key validation, so ordering is informational, not
+    load-bearing).
+    """
+    return (_drawers_collection_name(), CLOSETS_COLLECTION_NAME)
+
+
+# Back-compat alias for callers that imported the constant. New code
+# should call ``_recoverable_collections()`` so config changes are picked
+# up at call time.
+RECOVERABLE_COLLECTIONS = (COLLECTION_NAME, CLOSETS_COLLECTION_NAME)
 
 
 def _get_palace_path():
@@ -487,12 +522,11 @@ def _rebuild_one_collection(
     caller can stop the loop and print recovery instructions instead of
     silently shipping a partial palace.
     """
-    col = backend.create_collection(dest_palace, collection_name)
-
     ids: list[str] = []
     docs: list[str] = []
     metas: list[dict] = []
     upserted = 0
+    col = None
 
     def _flush() -> int:
         nonlocal upserted
@@ -507,6 +541,14 @@ def _flush() -> int:
         return upserted
 
     try:
+        # ``create_collection`` lives inside the try so a Chroma-side
+        # "Collection already exists" failure (which can happen when the
+        # process-wide System cache still holds a pre-archive schema) is
+        # reported as a structured ``RebuildPartialError`` carrying
+        # ``archive_path`` — instead of an unstructured exception that
+        # strands the user without recovery instructions.
+        col = backend.create_collection(dest_palace, collection_name)
+
         for emb_id, doc, meta in extract_via_sqlite(source_palace, collection_name):
             ids.append(emb_id)
             docs.append(doc or "")
@@ -664,8 +706,14 @@ def rebuild_from_sqlite(
 
     Returns a ``{collection_name: row_count}`` dict so callers (CLI,
     tests) can verify the per-collection rebuild count without parsing
-    stdout. Returns ``{}`` on validation failures (missing source,
-    refusing to overwrite). Raises :class:`RebuildPartialError` if a
+    stdout. A successful rebuild always returns a dict with one key per
+    recoverable collection (values may be ``0`` when a collection is
+    legitimately empty in the source). The empty dict ``{}`` is reserved
+    for validation refusals (missing source DB, refusing to overwrite an
+    existing dest, in-place mode without ``archive_existing_dest``); CLI
+    callers should treat ``{}`` as an error and exit non-zero so CI and
+    scripts can distinguish "invalid inputs" from "successful recovery
+    that found zero rows." Raises :class:`RebuildPartialError` if a
     chromadb upsert fails partway through; the dest palace is left in
     place so the user can inspect what landed, and the in-place archive
     (when applicable) is reported in the error so the user can re-run
@@ -765,31 +813,42 @@ def rebuild_from_sqlite(
 
     os.makedirs(dest_palace, exist_ok=True)
 
+    # Backend lifetime is wrapped in try/finally so the dest palace's
+    # PersistentClient handle (opened lazily inside ``create_collection``
+    # / ``get_collection``) is released on every exit path: success,
+    # ``RebuildPartialError``, or any unexpected exception. Without this,
+    # a long-running process that calls ``rebuild_from_sqlite`` would
+    # leak SQLite/HNSW file handles into Chroma's ``SharedSystemClient``
+    # cache, surfacing later as "Collection already exists" on the next
+    # in-place rebuild or as a Windows file-lock failure on cleanup
+    # (cf. #1285's lifecycle hardening for the legacy rebuild path).
     backend = ChromaBackend()
     counts: dict[str, int] = {}
+    try:
+        for cname in _recoverable_collections():
+            print(f"\n  [{cname}]")
+            upserted = _rebuild_one_collection(
+                backend=backend,
+                source_palace=source_palace,
+                dest_palace=dest_palace,
+                collection_name=cname,
+                batch_size=batch_size,
+                archive_path=archive_path,
+                counts_so_far=counts,
+            )
+            counts[cname] = upserted
+            if upserted == 0:
+                print(f"    no rows found for {cname} in source palace")
+            else:
+                print(f"    done: {upserted} rows in {cname}")
 
-    for cname in RECOVERABLE_COLLECTIONS:
-        print(f"\n  [{cname}]")
-        upserted = _rebuild_one_collection(
-            backend=backend,
-            source_palace=source_palace,
-            dest_palace=dest_palace,
-            collection_name=cname,
-            batch_size=batch_size,
-            archive_path=archive_path,
-            counts_so_far=counts,
-        )
-        counts[cname] = upserted
-        if upserted == 0:
-            print(f"    no rows found for {cname} in source palace")
-        else:
-            print(f"    done: {upserted} rows in {cname}")
-
-    print(f"\n  Rebuild complete. {sum(counts.values())} total rows.")
-    if archive_path is not None:
-        print(f"  Original palace archived at: {archive_path}")
-    print(f"{'=' * 55}\n")
-    return counts
+        print(f"\n  Rebuild complete. {sum(counts.values())} total rows.")
+        if archive_path is not None:
+            print(f"  Original palace archived at: {archive_path}")
+        print(f"{'=' * 55}\n")
+        return counts
+    finally:
+        backend.close()
 
 
 def status(palace_path=None) -> dict:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6b4b7b3..71ca63d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1097,3 +1097,64 @@ def test_reconfigure_stdio_is_noop_off_windows():
         _reconfigure_stdio_utf8_on_windows()
 
     assert stdin.reconfigure_calls == []
+
+
+# ── cmd_repair: from-sqlite mode exit codes ──────────────────────────
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_from_sqlite_validation_refusal_exits_nonzero(mock_config_cls, tmp_path, capsys):
+    """When ``rebuild_from_sqlite`` returns ``{}`` for a validation
+    refusal (missing source DB, in-place without --archive-existing,
+    refusing to overwrite an existing dest), the CLI must surface a
+    non-zero exit so unattended scripts and CI distinguish "invalid
+    inputs" from "successful recovery that found zero rows."
+
+    Catches: a regression where the CLI treats the validation-refusal
+    sentinel as success, leaving CI green on a no-op repair that should
+    have alerted an operator.
+    """
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+
+    args = argparse.Namespace(
+        palace=str(palace_dir),
+        mode="from-sqlite",
+        source=None,
+        archive_existing=False,
+        yes=True,
+    )
+    with patch("mempalace.repair.rebuild_from_sqlite", return_value={}):
+        with pytest.raises(SystemExit) as excinfo:
+            cmd_repair(args)
+    assert excinfo.value.code == 1
+
+
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_from_sqlite_success_does_not_exit(mock_config_cls, tmp_path):
+    """A successful from-sqlite rebuild — even one that finds zero rows
+    in a legitimately empty source palace — must NOT call ``sys.exit``.
+    A populated counts dict (with ``0`` values) is the success signal;
+    only the empty dict ``{}`` is reserved for validation refusal.
+
+    Catches: a regression where ``if not counts`` is replaced by
+    ``if not sum(counts.values())`` or similar, conflating "empty source"
+    with "validation refused" and breaking idempotent recovery scripts.
+    """
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+
+    args = argparse.Namespace(
+        palace=str(palace_dir),
+        mode="from-sqlite",
+        source=None,
+        archive_existing=False,
+        yes=True,
+    )
+    # Zero rows but per-collection keys present → success, no exit.
+    fake_counts = {"mempalace_drawers": 0, "mempalace_closets": 0}
+    with patch("mempalace.repair.rebuild_from_sqlite", return_value=fake_counts):
+        # Should return cleanly; no SystemExit raised.
+        cmd_repair(args)
diff --git a/tests/test_repair.py b/tests/test_repair.py
index 35e6a44..8ca72fb 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -719,6 +719,13 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
     Catches: anyone who breaks the segments/embeddings/embedding_metadata
     JOIN, swaps the metadata vs vector segment, or changes how the
     document is stored under the ``chroma:document`` key.
+
+    Also asserts every embedding row underlying the extraction lives in
+    a ``segments.scope = 'METADATA'`` segment. Document + metadata rows
+    are stored under METADATA in Chroma's segment layout while HNSW
+    files live under ``VECTOR``; locking that assumption in here means a
+    future refactor that accidentally points the JOIN at ``VECTOR``
+    fails this test instead of silently regressing the recovery path.
     """
     rows = [
         (f"drawer_{i:03d}", f"document body {i}", {"wing": "test_wing", "room": f"r{i % 3}"})
@@ -736,6 +743,35 @@ def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):
         assert got_doc == doc, f"document mangled for {emb_id}"
         assert got_meta == meta, f"metadata mangled for {emb_id}: {got_meta!r}"
 
+    # Lock the segment-scope assumption directly against Chroma's on-disk
+    # layout so a future change that points the extraction JOIN at the
+    # VECTOR segment cannot pass this test. Query each extracted row's
+    # backing segment scope via the same SQLite tables ``extract_via_sqlite``
+    # reads from.
+    sqlite_path = os.path.join(str(tmp_path), "chroma.sqlite3")
+    conn = sqlite3.connect(f"file:{sqlite_path}?mode=ro", uri=True)
+    try:
+        scopes = {
+            scope
+            for (scope,) in conn.execute(
+                """
+                SELECT DISTINCT s.scope
+                FROM embeddings e
+                JOIN segments s ON e.segment_id = s.id
+                JOIN collections c ON s.collection = c.id
+                WHERE c.name = ? AND e.embedding_id IN ({})
+                """.format(",".join("?" * len(extracted))),
+                ("mempalace_drawers", *(emb_id for emb_id, _, _ in extracted)),
+            )
+        }
+    finally:
+        conn.close()
+    assert scopes == {"METADATA"}, (
+        f"extraction is reading from segments scoped {scopes!r}; only "
+        "'METADATA' should back the document/metadata rows. If Chroma's "
+        "segment layout changed, update extract_via_sqlite's WHERE clause."
+    )
+
 
 def test_extract_via_sqlite_preserves_typed_metadata(tmp_path):
     """Chromadb stores int / float / bool / string in distinct typed
@@ -973,3 +1009,56 @@ def boom(self, **kwargs):
     assert err.archive_path is not None
     assert os.path.isfile(os.path.join(err.archive_path, "chroma.sqlite3"))
     assert err.dest_palace == os.path.abspath(str(palace))
+
+
+def test_rebuild_from_sqlite_honors_configured_drawer_collection_name(tmp_path, monkeypatch):
+    """A user with a non-default drawers collection name (set via
+    ``MempalaceConfig().collection_name``) must have THAT collection
+    rebuilt — not the hardcoded ``mempalace_drawers``.
+
+    Catches: a regression where the recovery path silently rebuilds the
+    default-name collection on a custom-named palace, leaving the user's
+    actual data unrebuilt while reporting "rebuild complete." This is
+    the failure mode reviewer mjc flagged on PR #1310 as needing to line
+    up with the configured-collection-name work in #1312. Closets stay
+    fixed (``mempalace_closets``) by design — the AAAK index references
+    drawer IDs by string and is not per-deployment configurable.
+
+    Strategy: monkeypatch the lazy resolver so the test is hermetic and
+    does not depend on the global config file or env state.
+    """
+    from mempalace.backends.chroma import ChromaBackend
+
+    custom_drawers = "custom_drawers_xyz"
+    monkeypatch.setattr(repair, "_drawers_collection_name", lambda: custom_drawers)
+
+    source = tmp_path / "source"
+    dest = tmp_path / "dest"
+
+    drawer_rows = [(f"d{i}", f"body {i}", {"wing": "alpha"}) for i in range(3)]
+    closet_rows = [("closet_a", "abbrev →d0", {"wing": "alpha"})]
+    _seed_palace(source, custom_drawers, drawer_rows)
+    _seed_palace(source, "mempalace_closets", closet_rows)
+
+    counts = repair.rebuild_from_sqlite(str(source), str(dest))
+
+    # Rebuilt under the custom name, not under the default "mempalace_drawers".
+    assert counts == {custom_drawers: 3, "mempalace_closets": 1}
+
+    backend = ChromaBackend()
+    rebuilt_drawers = backend.get_collection(str(dest), custom_drawers)
+    assert rebuilt_drawers.count() == 3
+
+    # Default-name collection must NOT exist in dest — proves we did not
+    # silently fall back to the hardcoded name during rebuild.
+    try:
+        rebuilt_default = backend.get_collection(str(dest), "mempalace_drawers")
+        # If get_collection returns without raising, count() should be 0
+        # (chromadb may auto-create on get with some EFs); a non-zero
+        # count would mean we wrote rows to the wrong collection.
+        assert rebuilt_default.count() == 0, (
+            "rebuild leaked rows into the default-name collection on a "
+            "custom-name palace — recovery wrote to the wrong collection."
+        )
+    except Exception:
+        pass  # Expected: collection wasn't created.

From e334e257bf4bb634cf4f31b4087f84f8b467938b Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Wed, 6 May 2026 04:52:18 -0300
Subject: [PATCH 076/127] fix(mcp): retry _get_collection once on transient
 failure (#1286)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A transient chromadb exception inside `_get_collection` was swallowed by
the bare `except Exception: return None`, leaving every subsequent tool
call hitting the same poisoned cache silently. The fix wraps the body
in a `for attempt in range(2)` loop: on attempt 0 failure, log via
`logger.exception(...)` and clear `_client_cache` / `_collection_cache`
/ `_metadata_cache` so the next iteration forces `_get_client()` to
rebuild from scratch — that path now re-runs `quarantine_stale_hnsw`
(per #1322), so the second attempt heals the common stale-handle case
automatically. If both attempts fail, return `None` (matches the prior
contract for permanent failures).

Two new tests in `tests/test_mcp_server.py::TestCacheInvalidation`:
- `test_get_collection_retries_once_on_exception` — first attempt raises
  via a monkeypatched `_get_client`, second attempt succeeds; assert the
  caller gets the collection back, not None.
- `test_get_collection_returns_none_after_two_failures` — both attempts
  fail, assert we exhaust the loop and return None (no infinite retry).

Surgical extraction from PR #1286, which carried the same fix idea
(plus a fork-sync bundle that couldn't be merged); credit to the
original author below.

Co-authored-by: Jeffrey Hein <jp@jphein.com>
---
 mempalace/mcp_server.py  | 148 +++++++++++++++++++++++----------------
 tests/test_mcp_server.py |  65 +++++++++++++++++
 2 files changed, 152 insertions(+), 61 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 58f9ba9..bbb9c93 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -326,68 +326,94 @@ def _get_client():
 
 
 def _get_collection(create=False):
-    """Return the ChromaDB collection, caching the client between calls."""
-    global _collection_cache, _metadata_cache, _metadata_cache_time
-    try:
-        client = _get_client()
-        # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
-        # collection but not the EF *instance/configuration*. So a reader or
-        # writer that omits ``embedding_function=`` silently gets chromadb's
-        # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
-        # one we spoof in ``mempalace.embedding`` (both report ``"default"``,
-        # the identity check passes), but the *provider list* is chromadb's
-        # default rather than the user's resolved device. On bleeding-edge
-        # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
-        # that default provider selection can SIGSEGV the host process on
-        # first ``col.add()``. The miner / Stop hook ingest path avoids this
-        # because it routes through ``ChromaBackend.get_collection``, which
-        # resolves the EF via ``ChromaBackend._resolve_embedding_function``;
-        # the MCP server bypassed that abstraction. Resolve the EF inside the
-        # branches that actually open a collection so warm-cache reads stay
-        # zero-cost. Reuse the backend helper so the two call sites can't
-        # drift on logging or fallback semantics.
-        if create:
-            ef = ChromaBackend._resolve_embedding_function()
-            ef_kwargs = {"embedding_function": ef} if ef is not None else {}
-            # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
-            # HNSW insert path, which has a race in repairConnectionsForUpdate /
-            # addPoint (see issues #974, #965). Set via metadata on fresh
-            # collections and re-applied via _pin_hnsw_threads() for legacy
-            # palaces whose collections were created before this fix (the
-            # runtime config does not persist cross-process in chromadb 1.5.x,
-            # so the retrofit runs every time _get_collection opens a cache).
-            #
-            # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
-            # is called with metadata that differs from what's stored. The split
-            # below skips the metadata-comparison codepath for existing
-            # collections, mirroring the backend-layer fix from #1262.
-            try:
+    """Return the ChromaDB collection, caching the client between calls.
+
+    On failure, log the exception and retry once after clearing the client
+    and collection caches. Tools were silently returning ``None`` when a
+    cached client/collection went stale — typically after the chromadb
+    rust bindings invalidated a handle following an out-of-band write —
+    leaving the LLM with no diagnostic and no recovery path. The retry
+    forces ``_get_client()`` to rebuild from scratch (which re-runs
+    ``quarantine_stale_hnsw`` per #1322), so the second attempt heals the
+    common stale-handle / stale-HNSW case automatically.
+    """
+    global _client_cache, _collection_cache, _metadata_cache, _metadata_cache_time
+    for attempt in range(2):
+        try:
+            client = _get_client()
+            # ChromaDB 1.x persists the EF *identity* (its ``name()``) with the
+            # collection but not the EF *instance/configuration*. So a reader or
+            # writer that omits ``embedding_function=`` silently gets chromadb's
+            # built-in ``DefaultEmbeddingFunction`` — its ``name()`` matches the
+            # one we spoof in ``mempalace.embedding`` (both report ``"default"``,
+            # the identity check passes), but the *provider list* is chromadb's
+            # default rather than the user's resolved device. On bleeding-edge
+            # interpreters (#1299: python 3.14 + chromadb 1.5.x on Apple Silicon)
+            # that default provider selection can SIGSEGV the host process on
+            # first ``col.add()``. The miner / Stop hook ingest path avoids this
+            # because it routes through ``ChromaBackend.get_collection``, which
+            # resolves the EF via ``ChromaBackend._resolve_embedding_function``;
+            # the MCP server bypassed that abstraction. Resolve the EF inside the
+            # branches that actually open a collection so warm-cache reads stay
+            # zero-cost. Reuse the backend helper so the two call sites can't
+            # drift on logging or fallback semantics.
+            if create:
+                ef = ChromaBackend._resolve_embedding_function()
+                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
+                # hnsw:num_threads=1 disables ChromaDB's multi-threaded ParallelFor
+                # HNSW insert path, which has a race in repairConnectionsForUpdate /
+                # addPoint (see issues #974, #965). Set via metadata on fresh
+                # collections and re-applied via _pin_hnsw_threads() for legacy
+                # palaces whose collections were created before this fix (the
+                # runtime config does not persist cross-process in chromadb 1.5.x,
+                # so the retrofit runs every time _get_collection opens a cache).
+                #
+                # ChromaDB 1.5.x's Rust binding SIGSEGVs when get_or_create_collection
+                # is called with metadata that differs from what's stored. The split
+                # below skips the metadata-comparison codepath for existing
+                # collections, mirroring the backend-layer fix from #1262.
+                try:
+                    raw = client.get_collection(_config.collection_name, **ef_kwargs)
+                except _ChromaNotFoundError:
+                    raw = client.create_collection(
+                        _config.collection_name,
+                        metadata={
+                            "hnsw:space": "cosine",
+                            "hnsw:num_threads": 1,
+                            **_HNSW_BLOAT_GUARD,
+                        },
+                        **ef_kwargs,
+                    )
+                _pin_hnsw_threads(raw)
+                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
+                _metadata_cache = None
+                _metadata_cache_time = 0
+            elif _collection_cache is None:
+                ef = ChromaBackend._resolve_embedding_function()
+                ef_kwargs = {"embedding_function": ef} if ef is not None else {}
                 raw = client.get_collection(_config.collection_name, **ef_kwargs)
-            except _ChromaNotFoundError:
-                raw = client.create_collection(
-                    _config.collection_name,
-                    metadata={
-                        "hnsw:space": "cosine",
-                        "hnsw:num_threads": 1,
-                        **_HNSW_BLOAT_GUARD,
-                    },
-                    **ef_kwargs,
-                )
-            _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
-            _metadata_cache = None
-            _metadata_cache_time = 0
-        elif _collection_cache is None:
-            ef = ChromaBackend._resolve_embedding_function()
-            ef_kwargs = {"embedding_function": ef} if ef is not None else {}
-            raw = client.get_collection(_config.collection_name, **ef_kwargs)
-            _pin_hnsw_threads(raw)
-            _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
-            _metadata_cache = None
-            _metadata_cache_time = 0
-        return _collection_cache
-    except Exception:
-        return None
+                _pin_hnsw_threads(raw)
+                _collection_cache = ChromaCollection(raw, palace_path=_config.palace_path)
+                _metadata_cache = None
+                _metadata_cache_time = 0
+            return _collection_cache
+        except Exception:
+            logger.exception(
+                "_get_collection attempt %d/2 failed (palace=%s, create=%s)",
+                attempt + 1,
+                _config.palace_path,
+                create,
+            )
+            if attempt == 0:
+                # Reset all caches so the next attempt forces _get_client()
+                # to rebuild the chromadb client from scratch — that path
+                # re-runs quarantine_stale_hnsw (#1322) and reopens the
+                # collection cleanly, healing the common stale-handle case.
+                _client_cache = None
+                _collection_cache = None
+                _metadata_cache = None
+                _metadata_cache_time = 0
+    return None
 
 
 def _no_palace():
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index c073830..ae20bf3 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1259,6 +1259,71 @@ def _spy_create(self, name, **kwargs):
             assert "embedding_function" in kwargs
             assert kwargs["embedding_function"] is not None
 
+    def test_get_collection_retries_once_on_exception(self, monkeypatch, config, palace_path, kg):
+        """Regression: a transient failure inside _get_collection must trigger
+        one retry after clearing the client/collection caches, not silently
+        return None.
+
+        Before this fix, a stale chromadb handle (e.g. the rust bindings
+        invalidating after an out-of-band write) would raise inside the
+        single ``try`` block, get swallowed by ``except Exception: return
+        None``, and every subsequent tool call would hit the same poisoned
+        cache returning None. The retry forces ``_get_client()`` to rebuild
+        the client (which re-runs ``quarantine_stale_hnsw`` per #1322), so
+        the second attempt heals the common stale-handle case.
+        """
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        # Force a cold cache so the first call goes through the open path.
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        real_get_client = mcp_server._get_client
+        attempts = {"count": 0}
+
+        def flaky_get_client():
+            attempts["count"] += 1
+            if attempts["count"] == 1:
+                raise RuntimeError("simulated transient chromadb failure")
+            return real_get_client()
+
+        monkeypatch.setattr(mcp_server, "_get_client", flaky_get_client)
+
+        col = mcp_server._get_collection()
+
+        # Both attempts ran and the second succeeded.
+        assert attempts["count"] == 2
+        assert col is not None
+
+    def test_get_collection_returns_none_after_two_failures(
+        self, monkeypatch, config, palace_path, kg
+    ):
+        """If both attempts fail, return None (matches the prior contract for
+        permanent failures — only the transient case is now self-healing)."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        _client, _col = _get_collection(palace_path, create=True)
+        del _client
+        from mempalace import mcp_server
+
+        mcp_server._client_cache = None
+        mcp_server._collection_cache = None
+
+        attempts = {"count": 0}
+
+        def always_fails():
+            attempts["count"] += 1
+            raise RuntimeError("permanent chromadb failure")
+
+        monkeypatch.setattr(mcp_server, "_get_client", always_fails)
+
+        col = mcp_server._get_collection()
+
+        assert attempts["count"] == 2
+        assert col is None
+
 
 class TestKGLazyCache:
     """Lazy per-path KnowledgeGraph cache (issue #1136)."""

From e28ac9460303cdc8dd027a8a4a3dd7fe9b62eaea Mon Sep 17 00:00:00 2001
From: sjhddh <jhao.sun@gmail.com>
Date: Wed, 6 May 2026 13:22:11 +0000
Subject: [PATCH 077/127] docs: clarify contributor git identity setup

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 CONTRIBUTING.md | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9c6501d..fed0160 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -93,3 +93,15 @@ If you're planning a significant change, open an issue first to discuss the appr
 ## License
 
 MIT — your contributions will be released under the same license.
+
+## Git identity for contributions
+
+Before pushing commits, verify that Git is configured with an email address that GitHub can associate with your account:
+
+```bash
+git config user.name
+git config user.email
+```
+
+This is especially important when commits are created through agentic coding tools or automation, because those tools may not inherit your normal shell Git configuration. Avoid placeholder values such as `your@email.com` or localized template text; unresolved author emails can create avoidable provenance and SBOM review friction for downstream users.
+

From bddba59ae3ffe1fab6a281b7090f9945287a98d6 Mon Sep 17 00:00:00 2001
From: MillaJ <232237854+milla-jovovich@users.noreply.github.com>
Date: Wed, 6 May 2026 12:35:01 -0700
Subject: [PATCH 078/127] docs: add 30-day expiry callout + ship 4 auto-save
 tools
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a brief [!IMPORTANT] callout at the top of the README pointing
users to the urgent announcement at #1388. Claude Code auto-deletes
local JSONL transcripts after 30 days; users without the auto-save
hooks wired are losing transcript data off the rolling window.

Ships 4 small standalone tools at tools/:
- backup_claude_jsonls.sh — rsync ~/.claude/projects/ to a safe folder
- render_jsonl.py — convert JSONL transcripts to readable text
- find_orphan_claude_jsonls.sh — scan backup locations for orphan
  Claude Code transcripts (multi-line shape detection + topic preview)
- save.md — Claude Code slash command for manual /save into MemPalace

Tools verified by independent agent against v3.3.4 source.
Read-only on user data. POSIX bash + Python stdlib only.
---
 README.md                          |   4 +
 tools/backup_claude_jsonls.sh      |  39 ++++++++++
 tools/find_orphan_claude_jsonls.sh | 115 +++++++++++++++++++++++++++++
 tools/render_jsonl.py              |  71 ++++++++++++++++++
 tools/save.md                      |  26 +++++++
 5 files changed, 255 insertions(+)
 create mode 100755 tools/backup_claude_jsonls.sh
 create mode 100755 tools/find_orphan_claude_jsonls.sh
 create mode 100755 tools/render_jsonl.py
 create mode 100644 tools/save.md

diff --git a/README.md b/README.md
index 8157fca..d82bcd2 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,10 @@
 > domain — including `mempalace.tech` — is an impostor and may distribute
 > malware. Details and timeline: [docs/HISTORY.md](docs/HISTORY.md).
 
+> [!IMPORTANT]
+> **🚨 Claude Code sessions expire in 30 days w/out auto-save hooks wired!** **[Read this →](https://github.com/MemPalace/mempalace/discussions/1388)**
+
+
 <div align="center">
 
 <img src="assets/mempalace_logo.png" alt="MemPalace" width="240">
diff --git a/tools/backup_claude_jsonls.sh b/tools/backup_claude_jsonls.sh
new file mode 100755
index 0000000..f252de0
--- /dev/null
+++ b/tools/backup_claude_jsonls.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# backup_claude_jsonls.sh
+#
+# Claude Code stores every conversation as a JSONL transcript at
+#   ~/.claude/projects/<encoded-project>/<session-uuid>.jsonl
+# Anthropic auto-deletes those files after 30 DAYS:
+#   https://docs.claude.com/en/docs/claude-code/data-usage
+#
+# This script copies them, read-only, into ~/Documents/Claude_JSONL_Backup/
+# so the 30-day clock no longer applies. Re-run any time — rsync is incremental.
+# It NEVER deletes, modifies, or touches files inside ~/.claude/.
+
+set -eu
+
+SRC="${HOME}/.claude/projects/"
+DST="${HOME}/Documents/Claude_JSONL_Backup/"
+
+[ -d "$SRC" ] || { echo "ERROR: $SRC does not exist."; exit 1; }
+mkdir -p "$DST"
+
+echo "Backing up $SRC -> $DST"
+rsync -a --times "$SRC" "$DST"
+
+src_count=$(find "$SRC" -type f -name '*.jsonl' | wc -l | tr -d ' ')
+dst_count=$(find "$DST" -type f -name '*.jsonl' | wc -l | tr -d ' ')
+oldest=$(find "$DST" -type f -name '*.jsonl' -exec stat -f '%Sm %N' -t '%Y-%m-%d' {} \; 2>/dev/null \
+        || find "$DST" -type f -name '*.jsonl' -printf '%TY-%Tm-%Td %p\n' 2>/dev/null)
+oldest_date=$(echo "$oldest" | sort | head -n 1 | awk '{print $1}')
+newest_date=$(echo "$oldest" | sort | tail -n 1 | awk '{print $1}')
+
+echo "Source JSONL count : $src_count"
+echo "Backup JSONL count : $dst_count"
+echo "Oldest backup file : ${oldest_date:-n/a}"
+echo "Newest backup file : ${newest_date:-n/a}"
+
+if [ "$src_count" -ne "$dst_count" ]; then
+  echo "FAIL: count mismatch ($src_count vs $dst_count)"; exit 2
+fi
+echo "OK: backup verified."
diff --git a/tools/find_orphan_claude_jsonls.sh b/tools/find_orphan_claude_jsonls.sh
new file mode 100755
index 0000000..43523f5
--- /dev/null
+++ b/tools/find_orphan_claude_jsonls.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+# find_orphan_claude_jsonls.sh — v3 (multi-line shape + verb-aware preview)
+# -----------------------------------------------------------------------------
+# Finds Claude Code conversation transcripts (.jsonl) that may have survived in
+# backup/sync locations. Claude Code stores transcripts at
+# ~/.claude/projects/<encoded>/<session>.jsonl and auto-deletes them locally
+# after 30 days. If your machine syncs to iCloud, Dropbox, Google Drive,
+# OneDrive, Time Machine, or you copied transcripts elsewhere manually, those
+# copies still exist. This script finds them and shows a topic preview from
+# the first substantive user message — strips leading filler interjections
+# ("ok so", "oh", "well", "hey") so previews surface the actual content.
+#
+# Read-only. Safe to re-run.
+# -----------------------------------------------------------------------------
+set -eu
+
+LOCATIONS=(
+  "$HOME/Library/Mobile Documents" "$HOME/Dropbox" "$HOME/Google Drive"
+  "$HOME/OneDrive" "$HOME/Documents" "$HOME/Desktop" "/Volumes"
+)
+
+TMP="$(mktemp)"; trap 'rm -f "$TMP" "$TMP.s"' EXIT
+
+printf "Scanning backup locations" >&2
+for loc in "${LOCATIONS[@]}"; do
+  [ -d "$loc" ] || continue
+  printf "." >&2
+  while IFS= read -r -d '' f; do
+    # Combined: shape detection (multi-line) + verb-aware topic preview
+    if preview="$(python3 - "$f" 2>/dev/null <<'PYEOF'
+import json, sys, re
+
+# Single-word/short greetings — message gets skipped entirely if it is just one of these
+GREETINGS = {'hi','hey','hello','thanks','thank you','ok','okay','yes','no',
+             'sure','cool','great','good','done','yep','nope','perfect','copy'}
+
+# Leading filler — interjections that get STRIPPED from the start of a message
+# before the preview is taken. Iterative — handles "ok so well, then..." → "then..."
+LEADING_FILLER = re.compile(
+    r'^(?:ok(?:ay)?|so|oh|well|anyway|btw|hmm+|um+|uh+|hey|hi|hello|right|'
+    r'yes|no|sure|cool|great|good|listen|look|wait|actually|alright|gotcha|'
+    r'yeah|yep|nope|nah)\b[\s,!.?:;-]*',
+    re.IGNORECASE
+)
+
+path = sys.argv[1]
+shape_ok = False
+preview = ""
+try:
+    with open(path, 'r', errors='replace') as fh:
+        for i, line in enumerate(fh):
+            if i >= 30: break
+            try:
+                d = json.loads(line)
+            except Exception:
+                continue
+            if not isinstance(d, dict): continue
+            # Shape check — accept if any line in first 30 has session fields
+            if not shape_ok and 'sessionId' in d and 'timestamp' in d and 'message' in d:
+                shape_ok = True
+            # Preview — first user message after stripping leading filler
+            if not preview:
+                role = d.get('type', '') or d.get('message', {}).get('role', '')
+                if role == 'user':
+                    content = d.get('message', {}).get('content', '')
+                    if isinstance(content, list):
+                        text = ' '.join(
+                            c.get('text', '') for c in content
+                            if isinstance(c, dict) and c.get('type') == 'text'
+                        )
+                    elif isinstance(content, str):
+                        text = content
+                    else:
+                        text = ''
+                    text = re.sub(r'\s+', ' ', text).strip()
+                    # Skip messages that are pure greetings
+                    if text.lower() in GREETINGS:
+                        continue
+                    # Iteratively strip leading filler tokens until stable
+                    prev_text = None
+                    while prev_text != text:
+                        prev_text = text
+                        text = LEADING_FILLER.sub('', text).strip()
+                    # Skip if what remains is too short
+                    if len(text) < 20:
+                        continue
+                    preview = text[:80] + ('...' if len(text) > 80 else '')
+            if shape_ok and preview: break
+except Exception:
+    pass
+if shape_ok:
+    print(preview if preview else "(no preview — first 30 lines were greetings or short)")
+    sys.exit(0)
+sys.exit(1)
+PYEOF
+)"; then
+      mtime="$(stat -f '%Sm' -t '%Y-%m-%d' "$f" 2>/dev/null || stat -c '%y' "$f" 2>/dev/null | cut -d' ' -f1)"
+      size="$(stat -f '%z' "$f" 2>/dev/null || stat -c '%s' "$f" 2>/dev/null)"
+      printf '%s\t%s\t%s\t%s\n' "$mtime" "$size" "$f" "$preview" >>"$TMP"
+    fi
+  done < <(find "$loc" -type f -name '*.jsonl' -print0 2>/dev/null)
+done
+printf "\n" >&2
+
+count=$(wc -l <"$TMP" | tr -d ' ')
+if [ "$count" -eq 0 ]; then
+  echo "No orphan Claude Code transcripts found in scanned backup locations."
+  exit 0
+fi
+sort -k1,1 "$TMP" >"$TMP.s"
+oldest="$(head -n 1 "$TMP.s" | cut -f1)"
+newest="$(tail -n 1 "$TMP.s" | cut -f1)"
+echo "Found $count orphan Claude Code transcript(s). Oldest: $oldest  Newest: $newest"
+echo "----------------------------------------------------------------------"
+awk -F'\t' '{ printf "%s  %10s  %s\n              \"%s\"\n\n", $1, $2, $3, $4 }' "$TMP.s"
diff --git a/tools/render_jsonl.py b/tools/render_jsonl.py
new file mode 100755
index 0000000..3d74c00
--- /dev/null
+++ b/tools/render_jsonl.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""render_jsonl.py — turn one Claude Code JSONL transcript into readable text.
+
+Claude Code stores conversations at ~/.claude/projects/<proj>/<uuid>.jsonl and
+Anthropic auto-deletes them after 30 days
+(https://docs.claude.com/en/docs/claude-code/data-usage). This script renders a
+JSONL into a clean .txt so you can keep / read / share it without the tooling.
+
+Usage:
+    python3 render_jsonl.py <input.jsonl> [output.txt]
+
+Stdlib only. Python 3.9+. Read-only on the input.
+"""
+import json, sys
+from pathlib import Path
+
+def extract_text(content):
+    if isinstance(content, str):
+        return content.strip()
+    if isinstance(content, list):
+        parts = []
+        for blk in content:
+            if isinstance(blk, dict) and blk.get("type") == "text":
+                t = (blk.get("text") or "").strip()
+                if t:
+                    parts.append(t)
+        return "\n".join(parts)
+    return ""
+
+def main():
+    if len(sys.argv) < 2:
+        print(__doc__); sys.exit(1)
+    src = Path(sys.argv[1])
+    if not src.is_file():
+        print(f"ERROR: not a file: {src}"); sys.exit(1)
+    out = open(sys.argv[2], "w", encoding="utf-8") if len(sys.argv) > 2 else sys.stdout
+
+    turns, stamps = [], []
+    for raw in src.read_text(encoding="utf-8", errors="replace").splitlines():
+        if not raw.strip():
+            continue
+        try:
+            obj = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        role = obj.get("type") or (obj.get("message") or {}).get("role")
+        if role not in ("user", "assistant"):
+            continue
+        msg = obj.get("message") or obj
+        text = extract_text(msg.get("content"))
+        if not text:
+            continue
+        ts = obj.get("timestamp") or ""
+        if ts: stamps.append(ts)
+        turns.append((ts, role, text))
+
+    header = [
+        f"# Claude Code transcript: {src}",
+        f"# Total turns: {len(turns)}",
+        f"# Date range : {min(stamps) if stamps else 'n/a'}  ->  {max(stamps) if stamps else 'n/a'}",
+        "#" + "-" * 70, "",
+    ]
+    out.write("\n".join(header))
+    for ts, role, text in turns:
+        out.write(f"\n[{ts}] {role.upper()}\n{text}\n\n{'-'*72}\n")
+    if out is not sys.stdout:
+        out.close()
+        print(f"Wrote {len(turns)} turns to {sys.argv[2]}")
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/save.md b/tools/save.md
new file mode 100644
index 0000000..914156b
--- /dev/null
+++ b/tools/save.md
@@ -0,0 +1,26 @@
+---
+description: Save the current Claude Code session into MemPalace. Idempotent — won't dupe.
+---
+
+# /save
+
+Save the current Claude Code session into MemPalace. Run this when you
+want a checkpoint. Safe to run repeatedly — drawer IDs are content-hashed
+so re-running on the same session overwrites in place, no duplicates.
+
+Behavior:
+
+1. Find the current session's JSONL transcript path (Claude Code passes
+   it via the conversation context — look for `~/.claude/projects/` paths).
+2. Run via bash:
+
+   ```
+   mempalace mine "<TRANSCRIPT_PATH>" --mode convos --wing claude_imports
+   ```
+
+3. If the user supplied an argument after `/save`, use it as the wing name
+   instead of `claude_imports` (e.g. `/save my_research` →
+   `--wing my_research`).
+4. Report back: how many drawers were filed, into which wing/room.
+
+Requires `mempalace` to be installed (`pip install mempalace`).

From 921ff5a6faf753130ee5b6e9666daf6eec2bfc65 Mon Sep 17 00:00:00 2001
From: MillaJ <232237854+milla-jovovich@users.noreply.github.com>
Date: Wed, 6 May 2026 15:39:08 -0700
Subject: [PATCH 079/127] fix(tools/render_jsonl): split chained statements per
 ruff 0.4.x

Addresses CI lint feedback on PR #1391. No behavior change.
- Split `import json, sys` into separate lines (E401)
- Split chained `print(...); sys.exit(1)` into two lines (E702, two occurrences)
- Split inline `if ts: stamps.append(ts)` into two lines (E701)

Verified: `ruff check tools/render_jsonl.py` reports "All checks passed!"
Tool still renders correctly (3 turns from a real JSONL test, identical output to pre-fix).
---
 tools/render_jsonl.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tools/render_jsonl.py b/tools/render_jsonl.py
index 3d74c00..3372ee1 100755
--- a/tools/render_jsonl.py
+++ b/tools/render_jsonl.py
@@ -11,7 +11,8 @@
 
 Stdlib only. Python 3.9+. Read-only on the input.
 """
-import json, sys
+import json
+import sys
 from pathlib import Path
 
 def extract_text(content):
@@ -29,10 +30,12 @@ def extract_text(content):
 
 def main():
     if len(sys.argv) < 2:
-        print(__doc__); sys.exit(1)
+        print(__doc__)
+        sys.exit(1)
     src = Path(sys.argv[1])
     if not src.is_file():
-        print(f"ERROR: not a file: {src}"); sys.exit(1)
+        print(f"ERROR: not a file: {src}")
+        sys.exit(1)
     out = open(sys.argv[2], "w", encoding="utf-8") if len(sys.argv) > 2 else sys.stdout
 
     turns, stamps = [], []
@@ -51,7 +54,8 @@ def main():
         if not text:
             continue
         ts = obj.get("timestamp") or ""
-        if ts: stamps.append(ts)
+        if ts:
+            stamps.append(ts)
         turns.append((ts, role, text))
 
     header = [

From 7c679ba6250fd8cc57af24a22ce9e19b67b20429 Mon Sep 17 00:00:00 2001
From: MillaJ <232237854+milla-jovovich@users.noreply.github.com>
Date: Wed, 6 May 2026 16:12:34 -0700
Subject: [PATCH 080/127] fix(tools/render_jsonl): apply ruff format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Earlier commit fixed ruff lint but missed the formatter check.
This applies `ruff format` — adds standard PEP8 blank lines between
functions, splits one inline list. No behavior change.

Verified: both `ruff format --check` and `ruff check` pass cleanly.
Tool still renders correctly.
---
 tools/render_jsonl.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/render_jsonl.py b/tools/render_jsonl.py
index 3372ee1..2bec0da 100755
--- a/tools/render_jsonl.py
+++ b/tools/render_jsonl.py
@@ -11,10 +11,12 @@
 
 Stdlib only. Python 3.9+. Read-only on the input.
 """
+
 import json
 import sys
 from pathlib import Path
 
+
 def extract_text(content):
     if isinstance(content, str):
         return content.strip()
@@ -28,6 +30,7 @@ def extract_text(content):
         return "\n".join(parts)
     return ""
 
+
 def main():
     if len(sys.argv) < 2:
         print(__doc__)
@@ -62,7 +65,8 @@ def main():
         f"# Claude Code transcript: {src}",
         f"# Total turns: {len(turns)}",
         f"# Date range : {min(stamps) if stamps else 'n/a'}  ->  {max(stamps) if stamps else 'n/a'}",
-        "#" + "-" * 70, "",
+        "#" + "-" * 70,
+        "",
     ]
     out.write("\n".join(header))
     for ts, role, text in turns:
@@ -71,5 +75,6 @@ def main():
         out.close()
         print(f"Wrote {len(turns)} turns to {sys.argv[2]}")
 
+
 if __name__ == "__main__":
     main()

From be95ea7f3308f59b16d9db636b67f90f00a1901f Mon Sep 17 00:00:00 2001
From: JP de Boer knt <jpdeboer@atlastechrin.com>
Date: Thu, 7 May 2026 02:11:07 +0200
Subject: [PATCH 081/127] fix(mcp): retry tool_search once on Chroma "Error
 finding id" transient (#1315)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After a bulk CLI mine, ChromaDB's HNSW segment metadata can be unflushed
  for ~30-60s. Wing-scoped MCP search hits "Internal error: Error finding id"
  during that window, and the existing inode/mtime cache invalidation isn't
  enough — tool_search routes via search_memories -> palace.get_collection
  -> _DEFAULT_BACKEND._client, which has its own per-palace cache.

  This wraps tool_search with a single retry that drops both the MCP-local
  cache and _DEFAULT_BACKEND._clients/_freshness for the palace, sleeps 2s,
  retries once, and tags successful retries with index_recovered=True.

  Does not address tool_check_duplicate or other index-touching tools, nor
  the underlying flush window — options 1-3 from #1315 (auto-flush after
  mine, fail-fast detection, SQLite-only fallback) are still on the table
  for a complete fix.

  Refs #1315
---
 mempalace/mcp_server.py  | 58 ++++++++++++++++++++++++++++
 tests/test_mcp_server.py | 82 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 134 insertions(+), 6 deletions(-)

diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index bbb9c93..b01750b 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -168,6 +168,46 @@ def _call_kg(op):
 _palace_db_inode = 0  # inode of chroma.sqlite3 at cache time
 _palace_db_mtime = 0.0  # mtime of chroma.sqlite3 at cache time
 
+
+def _is_transient_index_error(result) -> bool:
+    # Chroma can return "Internal error: Error finding id" during the
+    # HNSW flush window after a bulk CLI mine — SQLite rows are
+    # committed but the binary segment metadata isn't flushed yet.
+    # Self-heals once the flush completes (~30-60s). See issue #1315.
+    if not isinstance(result, dict):
+        return False
+    err = result.get("error", "")
+    return isinstance(err, str) and ("Error finding id" in err or "Internal error" in err)
+
+
+def _force_chroma_cache_reset() -> None:
+    # Drop both the MCP-local client cache and the shared backend's
+    # per-palace cache so the next call rebuilds against the post-flush
+    # state. Without clearing _DEFAULT_BACKEND._clients the retry
+    # would just hit the same stale handle, since tool_search routes
+    # via search_memories -> palace.get_collection -> backend cache.
+    global \
+        _client_cache, \
+        _collection_cache, \
+        _palace_db_inode, \
+        _palace_db_mtime, \
+        _metadata_cache, \
+        _metadata_cache_time
+    _client_cache = None
+    _collection_cache = None
+    _palace_db_inode = 0
+    _palace_db_mtime = 0.0
+    _metadata_cache = None
+    _metadata_cache_time = 0
+    try:
+        from .palace import _DEFAULT_BACKEND
+
+        _DEFAULT_BACKEND._clients.pop(_config.palace_path, None)
+        _DEFAULT_BACKEND._freshness.pop(_config.palace_path, None)
+    except Exception:
+        pass
+
+
 # ── Vector-search disabled flag (#1222) ──────────────────────────────────
 # Set when ``hnsw_capacity_status`` reports a divergence between sqlite
 # and the HNSW segment large enough that chromadb would segfault on
@@ -721,6 +761,24 @@ def tool_search(
         max_distance=dist,
         vector_disabled=_vector_disabled,
     )
+    if _is_transient_index_error(result):
+        # Post-bulk-write HNSW flush window (#1315): drop caches, give
+        # the segment a moment to settle, retry once. Caller never sees
+        # the transient unless the second attempt also fails.
+        _force_chroma_cache_reset()
+        time.sleep(2)
+        _refresh_vector_disabled_flag()
+        result = search_memories(
+            sanitized["clean_query"],
+            palace_path=_config.palace_path,
+            wing=wing,
+            room=room,
+            n_results=limit,
+            max_distance=dist,
+            vector_disabled=_vector_disabled,
+        )
+        if not _is_transient_index_error(result):
+            result["index_recovered"] = True
     if _vector_disabled:
         result["vector_disabled"] = True
         result["vector_disabled_reason"] = _vector_disabled_reason
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index ae20bf3..95b173e 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -414,6 +414,76 @@ def test_search_rejects_invalid_room(self, monkeypatch, config, kg):
         result = mcp_server.tool_search(query="JWT", room="../backend")
         assert "error" in result
 
+    def test_search_retries_once_on_hnsw_flush_transient(self, monkeypatch, config, kg):
+        """Issue #1315: post-bulk-mine 'Error finding id' is retried once."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        calls = {"n": 0}
+        reset_calls = {"n": 0}
+
+        def fake_search(*args, **kwargs):
+            calls["n"] += 1
+            if calls["n"] == 1:
+                return {
+                    "error": "Search error: Error executing plan: Internal error: Error finding id"
+                }
+            return {"results": [{"text": "ok", "wing": "w", "room": "r"}]}
+
+        def fake_reset():
+            reset_calls["n"] += 1
+
+        monkeypatch.setattr(mcp_server, "search_memories", fake_search)
+        monkeypatch.setattr(mcp_server, "_force_chroma_cache_reset", fake_reset)
+        monkeypatch.setattr(mcp_server.time, "sleep", lambda _: None)
+
+        result = mcp_server.tool_search(query="anything")
+
+        assert calls["n"] == 2
+        assert reset_calls["n"] == 1
+        assert "results" in result
+        assert result.get("index_recovered") is True
+
+    def test_search_does_not_retry_on_non_transient_error(self, monkeypatch, config, kg):
+        """Validation / unrelated errors must not trigger the retry path."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        calls = {"n": 0}
+
+        def fake_search(*args, **kwargs):
+            calls["n"] += 1
+            return {"error": "Search error: invalid query syntax"}
+
+        monkeypatch.setattr(mcp_server, "search_memories", fake_search)
+
+        result = mcp_server.tool_search(query="anything")
+
+        assert calls["n"] == 1
+        assert "error" in result
+        assert "index_recovered" not in result
+
+    def test_search_returns_second_error_if_retry_also_fails(self, monkeypatch, config, kg):
+        """If the transient persists past the retry, surface the second error."""
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        calls = {"n": 0}
+
+        def fake_search(*args, **kwargs):
+            calls["n"] += 1
+            return {"error": "Search error: Error executing plan: Internal error: Error finding id"}
+
+        monkeypatch.setattr(mcp_server, "search_memories", fake_search)
+        monkeypatch.setattr(mcp_server, "_force_chroma_cache_reset", lambda: None)
+        monkeypatch.setattr(mcp_server.time, "sleep", lambda _: None)
+
+        result = mcp_server.tool_search(query="anything")
+
+        assert calls["n"] == 2
+        assert "error" in result
+        assert "index_recovered" not in result
+
     def test_list_drawers_rejects_invalid_wing(self, monkeypatch, config, kg):
         _patch_mcp_server(monkeypatch, config, kg)
         from mempalace import mcp_server
@@ -503,9 +573,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert (
-            result1["drawer_id"] != result2["drawer_id"]
-        ), "Documents with shared header but different content must have distinct drawer IDs"
+        assert result1["drawer_id"] != result2["drawer_id"], (
+            "Documents with shared header but different content must have distinct drawer IDs"
+        )
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -1243,9 +1313,9 @@ def _spy_create(self, name, **kwargs):
         all_calls = captured["get"] + captured["create"]
         assert all_calls, "expected get_collection or create_collection to be called"
         for kwargs in all_calls:
-            assert (
-                "embedding_function" in kwargs
-            ), f"missing embedding_function= in chromadb call: {kwargs}"
+            assert "embedding_function" in kwargs, (
+                f"missing embedding_function= in chromadb call: {kwargs}"
+            )
             assert kwargs["embedding_function"] is not None
 
         # Same expectation on the create=False (cache-miss) reopen path.

From 670aba974f7fd537bc48bab63c90599972bb304c Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 07:37:25 -0300
Subject: [PATCH 082/127] test(repair): close ChromaBackend in _seed_palace to
 release Windows file locks

The helper opened a chromadb PersistentClient via ChromaBackend and never
closed it, leaving rust-side SQLite/HNSW file locks alive after the
helper returned. On Windows that blocks the in-place archive rename
inside rebuild_from_sqlite with WinError 32 on data_level0.bin,
causing test_rebuild_from_sqlite_in_place_archives_when_opted_in and
test_rebuild_from_sqlite_raises_on_upsert_failure to fail in the
test-windows CI job. No test consumes the returned collection, so
closing the backend in a try/finally is safe and drops the return.
---
 tests/test_repair.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tests/test_repair.py b/tests/test_repair.py
index 8ca72fb..0277d9d 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -696,20 +696,24 @@ def flaky_detect(*args, **kwargs):
 def _seed_palace(palace_path, collection_name, rows):
     """Build a real chromadb palace at ``palace_path`` and add ``rows``.
 
-    ``rows`` is a list of ``(id, document, metadata)`` tuples. Returns
-    the populated collection so callers can assert on the writer's view
-    of state before the SQLite read.
+    ``rows`` is a list of ``(id, document, metadata)`` tuples.
     """
     from mempalace.backends.chroma import ChromaBackend
 
     backend = ChromaBackend()
-    col = backend.create_collection(str(palace_path), collection_name)
-    col.upsert(
-        ids=[r[0] for r in rows],
-        documents=[r[1] for r in rows],
-        metadatas=[r[2] for r in rows],
-    )
-    return col
+    try:
+        col = backend.create_collection(str(palace_path), collection_name)
+        col.upsert(
+            ids=[r[0] for r in rows],
+            documents=[r[1] for r in rows],
+            metadatas=[r[2] for r in rows],
+        )
+    finally:
+        # Release chromadb's rust-side SQLite/HNSW file locks before the
+        # caller proceeds. Without this, an in-place rebuild on Windows
+        # fails with WinError 32 on data_level0.bin during the archive
+        # rename (cf. PR #1310 test-windows job).
+        backend.close()
 
 
 def test_extract_via_sqlite_returns_all_rows_with_metadata(tmp_path):

From 7cf9b1758222a525e9285f979844797d92c1d59e Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 07:53:28 -0300
Subject: [PATCH 083/127] fix(repair): quote ChromaBackend annotation for
 Python 3.9 compatibility

`backend: ChromaBackend | None = None` evaluates the X | None union
eagerly at function-definition time, which Python 3.9 rejects with
TypeError: unsupported operand type(s) for |: 'ABCMeta' and 'NoneType'
since the new union syntax is 3.10+. Quoting matches the existing
forward-reference style in repair.py (sqlite_drawer_count, etc.) and
defers evaluation, restoring 3.9 compatibility.
---
 mempalace/repair.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mempalace/repair.py b/mempalace/repair.py
index 0585405..d7af0d9 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -593,7 +593,7 @@ def status(palace_path=None) -> dict:
 # ---------------------------------------------------------------------------
 
 
-def _close_chroma_handles(palace_path: str, backend: ChromaBackend | None = None) -> None:
+def _close_chroma_handles(palace_path: str, backend: "ChromaBackend | None" = None) -> None:
     """Drop ChromaBackend + chromadb singleton caches so OS mmap handles release.
 
     When ``backend`` is provided, close the live instance so rollback/restore

From 5488e7bb2286e5d5770d68c2719acefe92efba41 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 08:56:41 -0300
Subject: [PATCH 084/127] fix(miner): harden Windows mine against ONNX
 bad_alloc + silent partial exits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three small changes that together address the failure modes in #1296:

1. Add pnpm-lock.yaml and yarn.lock to SKIP_FILENAMES, mirroring the
   existing package-lock.json rule. A 24K-line pnpm-lock.yaml produced
   ~1124 chunks in one batch and tripped onnxruntime bad_alloc on
   Windows; pnpm/yarn lockfiles are no more useful to mine than npm's.

2. Skip any file that produces more than MAX_CHUNKS_PER_FILE (500)
   chunks, with a clear log line. Catches the broader class — generated
   CSV/JSON, build artifacts, etc. — that the named-file SKIP list will
   never fully cover. The cap is conservative (500 chunks * 800 chars ≈
   400 KB of source) so legitimate hand-written content still mines.

3. Print a partial-progress summary on any exception in _mine_impl, not
   just KeyboardInterrupt, then re-raise. Without this, an arbitrary
   exception (ONNX bad_alloc, chromadb HNSW error, OS fault) propagates
   silently — the operator sees only the last progress line and assumes
   the mine succeeded. The new path mirrors the KeyboardInterrupt
   summary (files_processed, drawers_filed, last_file) plus the
   exception type and message, then re-raises so the original traceback
   surfaces and the exit code is non-zero.

Tests cover: SKIP_FILENAMES contents, the chunk-cap path returning
(0, room) with no upserts, and the new mine-aborted summary surfacing
both the partial counters and the exception class.
---
 mempalace/miner.py  | 34 ++++++++++++++++++++
 tests/test_miner.py | 77 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 111 insertions(+)

diff --git a/mempalace/miner.py b/mempalace/miner.py
index 88734c9..6aeddd4 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -66,6 +66,8 @@
     "mempal.yml",
     ".gitignore",
     "package-lock.json",
+    "pnpm-lock.yaml",
+    "yarn.lock",
 }
 
 CHUNK_SIZE = 800  # chars per drawer
@@ -73,6 +75,13 @@
 MIN_CHUNK_SIZE = 50  # skip tiny chunks
 DRAWER_UPSERT_BATCH_SIZE = 1000
 MAX_FILE_SIZE = 500 * 1024 * 1024  # 500 MB — skip files larger than this.
+# A single file producing more chunks than this is almost always a generated
+# artifact (CSV/JSON dump, lockfile not in SKIP_FILENAMES, etc.). Embedding
+# thousands of chunks from one file in one batch has triggered ONNX runtime
+# `bad allocation` errors on Windows (#1296). The cap is conservative: a
+# 500-chunk file at CHUNK_SIZE=800 is ~400 KB of source, which covers most
+# legitimate hand-written content while bounding the worst-case batch.
+MAX_CHUNKS_PER_FILE = 500
 # Long Claude Code sessions and large transcript exports routinely exceed
 # 10 MB. The cap exists as a defensive rail against pathological binary
 # files, not as a limit on legitimate text. Per-drawer size is bounded
@@ -825,6 +834,13 @@ def process_file(
     room = detect_room(filepath, content, rooms, project_path)
     chunks = chunk_text(content, source_file)
 
+    if len(chunks) > MAX_CHUNKS_PER_FILE:
+        print(
+            f"  ! [skip] {filepath.name[:50]:50} produced {len(chunks)} chunks "
+            f"(> {MAX_CHUNKS_PER_FILE}); add to SKIP_FILENAMES or .gitignore"
+        )
+        return 0, room
+
     if dry_run:
         print(f"    [DRY RUN] {filepath.name} -> room:{room} ({len(chunks)} drawers)")
         return len(chunks), room
@@ -1167,6 +1183,24 @@ def _mine_impl(
             "already-filed drawers are\n  upserted idempotently and will not duplicate.\n"
         )
         sys.exit(130)
+    except Exception as exc:
+        # Without this, an arbitrary exception (ONNX bad_alloc, chromadb HNSW
+        # error, OS fault) propagates and the process exits with no completion
+        # banner — the operator sees only the final progress line and assumes
+        # the mine succeeded (#1296). Print the partial-progress summary the
+        # way we do for KeyboardInterrupt, then re-raise so the original
+        # traceback still surfaces and the exit code is non-zero.
+        print("\n\n  Mine aborted by exception.")
+        print(f"    files_processed: {files_processed}/{len(files)}")
+        print(f"    drawers_filed:   {total_drawers}")
+        print(f"    last_file:       {last_file or '<none>'}")
+        print(f"    error:           {type(exc).__name__}: {exc}")
+        print(
+            f"\n  Re-run `mempalace mine {shlex.quote(project_dir)}` after addressing "
+            "the cause — already-filed\n  drawers are upserted idempotently and will "
+            "not duplicate.\n"
+        )
+        raise
     finally:
         # Clean up the hooks-side PID lock if it points at us. Stale
         # entries already pass _pid_alive() == False on POSIX, but
diff --git a/tests/test_miner.py b/tests/test_miner.py
index 10124ee..10dd33d 100644
--- a/tests/test_miner.py
+++ b/tests/test_miner.py
@@ -699,6 +699,83 @@ def fake_process_file(*args, **kwargs):
     assert f"mempalace mine {shlex.quote(str(project_root))}" in out
 
 
+def test_skip_filenames_includes_lockfiles():
+    """pnpm-lock.yaml and yarn.lock must be skipped alongside package-lock.json
+    so a Windows mine over a typical JS monorepo doesn't OOM the ONNX embedder
+    on a 24K-line lockfile (#1296)."""
+    from mempalace import miner
+
+    assert "package-lock.json" in miner.SKIP_FILENAMES
+    assert "pnpm-lock.yaml" in miner.SKIP_FILENAMES
+    assert "yarn.lock" in miner.SKIP_FILENAMES
+
+
+def test_process_file_skips_when_chunks_exceed_max(tmp_path, monkeypatch):
+    """A file producing more than MAX_CHUNKS_PER_FILE chunks must be skipped
+    with a clear message and zero upserts. Generated artifacts (CSVs, lock
+    files not in SKIP_FILENAMES) hit this — the cap is what prevents ONNX
+    bad_alloc on Windows when the embedder is asked to swallow thousands of
+    chunks in one batch (#1296)."""
+    from unittest.mock import MagicMock
+
+    from mempalace import miner
+
+    monkeypatch.setattr(miner, "MAX_CHUNKS_PER_FILE", 5)
+    over_cap = [{"content": f"chunk {i}", "chunk_index": i} for i in range(7)]
+    monkeypatch.setattr(miner, "chunk_text", lambda content, source_file: over_cap)
+
+    source = tmp_path / "huge.csv"
+    source.write_text("col1,col2\n" + "x,y\n" * 500, encoding="utf-8")
+    col = MagicMock()
+    col.get.return_value = {"ids": []}
+
+    drawers, room = miner.process_file(
+        source,
+        tmp_path,
+        col,
+        "wing",
+        [{"name": "general", "description": "General"}],
+        "agent",
+        False,
+    )
+
+    assert drawers == 0
+    col.upsert.assert_not_called()
+
+
+def test_mine_arbitrary_exception_prints_summary_and_reraises(tmp_path, capsys):
+    """A non-KeyboardInterrupt exception mid-mine must surface a summary
+    banner before propagating, so users don't see a silent exit-0 with no
+    completion message (#1296 Failure 2). Re-raise preserves the traceback
+    and yields a non-zero exit code."""
+    import pytest
+    from unittest.mock import patch
+
+    project_root = tmp_path / "proj"
+    project_root.mkdir()
+    _make_minable_project(project_root, n_files=4)
+    palace_path = project_root / "palace"
+
+    call_count = {"n": 0}
+
+    def fake_process_file(*args, **kwargs):
+        call_count["n"] += 1
+        if call_count["n"] == 2:
+            raise RuntimeError("simulated ONNX bad_alloc")
+        return (1, "general")
+
+    with patch("mempalace.miner.process_file", side_effect=fake_process_file):
+        with pytest.raises(RuntimeError, match="simulated ONNX bad_alloc"):
+            mine(str(project_root), str(palace_path))
+
+    out = capsys.readouterr().out
+    assert "Mine aborted by exception." in out
+    assert "files_processed: 1/" in out
+    assert "drawers_filed:" in out
+    assert "RuntimeError: simulated ONNX bad_alloc" in out
+    assert "upserted idempotently" in out
+
+
 def test_mine_cleans_up_pid_file_on_interrupt(tmp_path):
     """Our own PID entry in mine.pid is removed in the finally clause."""
     import pytest

From ec6d2dde0170387804730d57f13ea17db6c7c5f8 Mon Sep 17 00:00:00 2001
From: Mika Cohen <mjc@kernel.org>
Date: Sat, 2 May 2026 00:16:29 -0600
Subject: [PATCH 085/127] fix: use configured collection in recovery paths

---
 mempalace/cli.py            |  8 +++-
 mempalace/config.py         |  8 ++++
 mempalace/mcp_server.py     | 61 ++++++++++++++++++++----
 mempalace/palace.py         |  7 ++-
 mempalace/repair.py         | 59 ++++++++++++++++--------
 mempalace/searcher.py       | 49 +++++++++++++-------
 tests/test_backends.py      | 23 ++++++++++
 tests/test_cli.py           | 44 ++++++++++++++++++
 tests/test_hnsw_capacity.py |  2 +
 tests/test_mcp_server.py    | 45 ++++++++++++++++--
 tests/test_repair.py        | 92 +++++++++++++++++++++++++++++++++++++
 tests/test_searcher.py      | 26 +++++++++--
 12 files changed, 370 insertions(+), 54 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 740de96..ac00283 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -663,8 +663,10 @@ def cmd_repair(args):
         check_extraction_safety,
     )
 
+    config = MempalaceConfig()
+    collection_name = config.collection_name
     palace_path = os.path.abspath(
-        os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
+        os.path.expanduser(args.palace) if args.palace else config.palace_path
     )
 
     if getattr(args, "mode", "legacy") == "max-seq-id":
@@ -749,7 +751,7 @@ def cmd_repair(args):
 
     # Try to read existing drawers
     try:
-        col = backend.get_collection(palace_path, "mempalace_drawers")
+        col = backend.get_collection(palace_path, collection_name)
         total = col.count()
         print(f"  Drawers found: {total}")
     except Exception as e:
@@ -784,6 +786,7 @@ def cmd_repair(args):
             palace_path,
             len(all_ids),
             confirm_truncation_ok=getattr(args, "confirm_truncation_ok", False),
+            collection_name=collection_name,
         )
     except TruncationDetected as e:
         print(e.message)
@@ -810,6 +813,7 @@ def cmd_repair(args):
             all_docs,
             all_metas,
             batch_size,
+            collection_name=collection_name,
             progress=print,
         )
     except RebuildCollectionError as e:
diff --git a/mempalace/config.py b/mempalace/config.py
index 2252a49..fd32a17 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -7,6 +7,7 @@
 import json
 import os
 import re
+from functools import lru_cache
 from pathlib import Path
 
 
@@ -127,6 +128,13 @@ def sanitize_content(value: str, max_length: int = 100_000) -> str:
 DEFAULT_PALACE_PATH = os.path.expanduser("~/.mempalace/palace")
 DEFAULT_COLLECTION_NAME = "mempalace_drawers"
 
+
+@lru_cache(maxsize=1)
+def get_configured_collection_name() -> str:
+    """Return the configured drawer collection name without repeated config-file reads."""
+    return MempalaceConfig().collection_name
+
+
 DEFAULT_TOPIC_WINGS = [
     "emotions",
     "consciousness",
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index bbb9c93..521cb07 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -193,7 +193,7 @@ def _refresh_vector_disabled_flag() -> None:
     """
     global _vector_disabled, _vector_disabled_reason, _vector_capacity_status
     try:
-        info = hnsw_capacity_status(_config.palace_path, "mempalace_drawers")
+        info = hnsw_capacity_status(_config.palace_path, _config.collection_name)
     except Exception:
         logger.debug("HNSW capacity probe raised", exc_info=True)
         return
@@ -490,6 +490,7 @@ def _tool_status_via_sqlite() -> dict:
     db_path = os.path.join(_config.palace_path, "chroma.sqlite3")
     if not os.path.isfile(db_path):
         return _no_palace()
+    collection_name = _config.collection_name
 
     wings: dict = {}
     rooms: dict = {}
@@ -503,8 +504,9 @@ def _tool_status_via_sqlite() -> dict:
                 FROM embeddings e
                 JOIN segments s ON e.segment_id = s.id
                 JOIN collections c ON s.collection = c.id
-                WHERE c.name = 'mempalace_drawers'
-                """
+                WHERE c.name = ?
+                """,
+                (collection_name,),
             ).fetchone()
             total = int(row[0]) if row and row[0] is not None else 0
             for key, target in (("wing", wings), ("room", rooms)):
@@ -515,12 +517,12 @@ def _tool_status_via_sqlite() -> dict:
                     JOIN embeddings e ON em.id = e.id
                     JOIN segments s ON e.segment_id = s.id
                     JOIN collections c ON s.collection = c.id
-                    WHERE c.name = 'mempalace_drawers'
+                    WHERE c.name = ?
                       AND em.key = ?
                       AND em.string_value IS NOT NULL
                     GROUP BY em.string_value
                     """,
-                    (key,),
+                    (collection_name, key),
                 ):
                     target[value] = count
         finally:
@@ -720,6 +722,7 @@ def tool_search(
         n_results=limit,
         max_distance=dist,
         vector_disabled=_vector_disabled,
+        collection_name=_config.collection_name,
     )
     if _vector_disabled:
         result["vector_disabled"] = True
@@ -922,8 +925,8 @@ def tool_add_drawer(
 
     # Idempotency: if the deterministic ID already exists, return success as a no-op.
     try:
-        existing = col.get(ids=[drawer_id])
-        if existing and existing["ids"]:
+        existing = col.get(ids=[drawer_id], include=[])
+        if existing.ids:
             return {"success": True, "reason": "already_exists", "drawer_id": drawer_id}
     except Exception:
         logger.debug("Idempotency pre-check failed for %s", drawer_id, exc_info=True)
@@ -943,6 +946,12 @@ def tool_add_drawer(
                 }
             ],
         )
+        inserted = col.get(ids=[drawer_id], include=[])
+        if not inserted.ids:
+            raise RuntimeError(
+                "Drawer write was acknowledged but the new ID is not readable. "
+                "The palace index may be stale; run reconnect or repair."
+            )
         _metadata_cache = None
         logger.info(f"Filed drawer: {drawer_id} → {wing}/{room}")
         return {"success": True, "drawer_id": drawer_id, "wing": wing, "room": room}
@@ -1506,6 +1515,30 @@ def tool_reconnect():
         _palace_db_mtime, \
         _vector_disabled, \
         _vector_disabled_reason
+    from . import palace as palace_module
+
+    close_errors = []
+    try:
+        palace_module._DEFAULT_BACKEND.close_palace(_config.palace_path)
+    except Exception as exc:
+        logger.debug("Failed to close shared palace backend during reconnect", exc_info=True)
+        close_errors.append(f"backend close_palace failed: {exc}")
+    try:
+        from chromadb.api.client import SharedSystemClient
+
+        clear_system_cache = getattr(SharedSystemClient, "clear_system_cache", None)
+        if callable(clear_system_cache):
+            clear_system_cache()
+        else:
+            logger.debug(
+                "SharedSystemClient.clear_system_cache is unavailable; skipping shared Chroma cache clear during reconnect"
+            )
+    except Exception as exc:
+        logger.debug(
+            "Failed to clear Chroma shared system cache during reconnect",
+            exc_info=True,
+        )
+        close_errors.append(f"shared Chroma cache clear failed: {exc}")
     _client_cache = None
     _collection_cache = None
     _palace_db_inode = 0
@@ -1527,12 +1560,24 @@ def tool_reconnect():
     try:
         col = _get_collection()
         if col is None:
-            return {
+            result = {
                 "success": False,
                 "message": "No palace found after reconnect",
                 "drawers": 0,
                 "vector_disabled": _vector_disabled,
             }
+            if close_errors:
+                result["error"] = "; ".join(close_errors)
+            return result
+        if close_errors:
+            return {
+                "success": False,
+                "message": "Reconnect reopened the palace but failed to fully reset cached handles",
+                "drawers": col.count(),
+                "vector_disabled": _vector_disabled,
+                "vector_disabled_reason": _vector_disabled_reason,
+                "error": "; ".join(close_errors),
+            }
         return {
             "success": True,
             "message": "Reconnected to palace",
diff --git a/mempalace/palace.py b/mempalace/palace.py
index e5f6411..dee5c8f 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -10,6 +10,7 @@
 import os
 import re
 import threading
+from typing import Optional
 
 from .backends.chroma import ChromaBackend
 
@@ -56,10 +57,14 @@
 
 def get_collection(
     palace_path: str,
-    collection_name: str = "mempalace_drawers",
+    collection_name: Optional[str] = None,
     create: bool = True,
 ):
     """Get the palace collection through the backend layer."""
+    if collection_name is None:
+        from .config import get_configured_collection_name
+
+        collection_name = get_configured_collection_name()
     return _DEFAULT_BACKEND.get_collection(
         palace_path,
         collection_name=collection_name,
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 34d165c..b47bcd6 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -181,10 +181,12 @@ def _rebuild_collection_via_temp(
     all_docs,
     all_metas,
     batch_size: int,
+    collection_name: Optional[str] = None,
     progress=print,
 ) -> int:
     expected = len(all_ids)
-    temp_name = REPAIR_TEMP_COLLECTION
+    collection_name = collection_name or _drawers_collection_name()
+    temp_name = f"{collection_name}__repair_tmp"
     live_replaced = False
 
     try:
@@ -203,9 +205,9 @@ def _rebuild_collection_via_temp(
         _verify_collection_count(temp_col, expected, "temporary rebuild")
 
         progress("  Rebuilding live collection...")
-        backend.delete_collection(palace_path, COLLECTION_NAME)
+        backend.delete_collection(palace_path, collection_name)
         live_replaced = True
-        new_col = backend.create_collection(palace_path, COLLECTION_NAME)
+        new_col = backend.create_collection(palace_path, collection_name)
 
         rebuilt = 0
         for i in range(0, expected, batch_size):
@@ -230,7 +232,7 @@ def _rebuild_collection_via_temp(
         raise RebuildCollectionError(str(exc), live_replaced=live_replaced) from exc
 
 
-def scan_palace(palace_path=None, only_wing=None):
+def scan_palace(palace_path=None, only_wing=None, collection_name: Optional[str] = None):
     """Scan the palace for corrupt/unfetchable IDs.
 
     Probes in batches of 100, falls back to per-ID on failure.
@@ -239,14 +241,15 @@ def scan_palace(palace_path=None, only_wing=None):
     Returns (good_set, bad_set).
     """
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
     print(f"\n  Palace: {palace_path}")
     print("  Loading...")
 
-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)
 
     where = {"wing": only_wing} if only_wing else None
     total = col.count()
-    print(f"  Collection: {COLLECTION_NAME}, total: {total:,}")
+    print(f"  Collection: {collection_name}, total: {total:,}")
     if only_wing:
         print(f"  Scanning wing: {only_wing}")
 
@@ -307,9 +310,10 @@ def scan_palace(palace_path=None, only_wing=None):
     return good_set, bad_set
 
 
-def prune_corrupt(palace_path=None, confirm=False):
+def prune_corrupt(palace_path=None, confirm=False, collection_name: Optional[str] = None):
     """Delete corrupt IDs listed in corrupt_ids.txt."""
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
     bad_file = os.path.join(palace_path, "corrupt_ids.txt")
 
     if not os.path.exists(bad_file):
@@ -325,7 +329,7 @@ def prune_corrupt(palace_path=None, confirm=False):
         print("  Re-run with --confirm to actually delete.")
         return
 
-    col = ChromaBackend().get_collection(palace_path, COLLECTION_NAME)
+    col = ChromaBackend().get_collection(palace_path, collection_name)
     before = col.count()
     print(f"  Collection size before: {before:,}")
 
@@ -379,7 +383,10 @@ def __init__(self, message: str, sqlite_count: "int | None", extracted: int):
 
 
 def check_extraction_safety(
-    palace_path: str, extracted: int, confirm_truncation_ok: bool = False
+    palace_path: str,
+    extracted: int,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
 ) -> None:
     """Cross-check that ``extracted`` matches the SQLite ground truth.
 
@@ -401,7 +408,8 @@ def check_extraction_safety(
     if confirm_truncation_ok:
         return
 
-    sqlite_count = sqlite_drawer_count(palace_path)
+    collection_name = collection_name or _drawers_collection_name()
+    sqlite_count = sqlite_drawer_count(palace_path, collection_name)
     cap_signal = extracted == CHROMADB_DEFAULT_GET_LIMIT
 
     if sqlite_count is not None and sqlite_count > extracted:
@@ -437,7 +445,7 @@ def check_extraction_safety(
         raise TruncationDetected(message, sqlite_count, extracted)
 
 
-def sqlite_drawer_count(palace_path: str) -> "int | None":
+def sqlite_drawer_count(palace_path: str, collection_name: Optional[str] = None) -> "int | None":
     """Count rows in ``chroma.sqlite3.embeddings`` for the drawers collection.
 
     Used as an independent ground-truth check against the chromadb
@@ -449,6 +457,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
     drift, missing tables, locked file). Callers treat ``None`` as
     "unknown" and fall back to the cap-detection check.
     """
+    collection_name = collection_name or _drawers_collection_name()
     sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
     if not os.path.exists(sqlite_path):
         return None
@@ -465,7 +474,7 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
                 JOIN collections c ON s.collection = c.id
                 WHERE c.name = ?
                 """,
-                (COLLECTION_NAME,),
+                (collection_name,),
             ).fetchone()
             return int(row[0]) if row and row[0] is not None else None
         finally:
@@ -477,7 +486,11 @@ def sqlite_drawer_count(palace_path: str) -> "int | None":
         return None
 
 
-def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
+def rebuild_index(
+    palace_path=None,
+    confirm_truncation_ok: bool = False,
+    collection_name: Optional[str] = None,
+):
     """Rebuild the HNSW index from scratch.
 
     1. Extract all drawers via ChromaDB get()
@@ -492,6 +505,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     (typically only a concern for palaces sized at exactly 10 000 rows).
     """
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
 
     if not os.path.isdir(palace_path):
         print(f"\n  No palace found at {palace_path}")
@@ -504,7 +518,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
 
     backend = ChromaBackend()
     try:
-        col = backend.get_collection(palace_path, COLLECTION_NAME)
+        col = backend.get_collection(palace_path, collection_name)
         total = col.count()
     except Exception as e:
         print(f"  Error reading palace: {e}")
@@ -528,7 +542,12 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
     # short of the SQLite ground truth (or when extraction == chromadb
     # default get() cap and the SQLite check couldn't run).
     try:
-        check_extraction_safety(palace_path, len(all_ids), confirm_truncation_ok)
+        check_extraction_safety(
+            palace_path,
+            len(all_ids),
+            confirm_truncation_ok,
+            collection_name=collection_name,
+        )
     except TruncationDetected as e:
         print(e.message)
         return
@@ -551,6 +570,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
             all_docs,
             all_metas,
             batch_size,
+            collection_name=collection_name,
             progress=print,
         )
     except RebuildCollectionError as e:
@@ -560,7 +580,7 @@ def rebuild_index(palace_path=None, confirm_truncation_ok: bool = False):
             print(f"  Restoring from backup: {backup_path}")
             try:
                 _close_chroma_handles(palace_path, backend=backend)
-                _delete_collection_if_exists(backend, palace_path, COLLECTION_NAME)
+                _delete_collection_if_exists(backend, palace_path, collection_name)
                 shutil.copy2(backup_path, sqlite_path)
                 print("  Backup restored. Palace is back to pre-repair state.")
             except Exception as restore_error:
@@ -950,7 +970,7 @@ def rebuild_from_sqlite(
         backend.close()
 
 
-def status(palace_path=None) -> dict:
+def status(palace_path=None, collection_name: Optional[str] = None) -> dict:
     """Read-only health check: compare sqlite vs HNSW element counts.
 
     Catches the #1222 failure mode where chromadb's HNSW segment freezes
@@ -968,6 +988,7 @@ def status(palace_path=None) -> dict:
     ``status="unknown"`` when no palace exists at the given path.
     """
     palace_path = palace_path or _get_palace_path()
+    collection_name = collection_name or _drawers_collection_name()
     print(f"\n{'=' * 55}")
     print("  MemPalace Repair — Status")
     print(f"{'=' * 55}\n")
@@ -977,8 +998,8 @@ def status(palace_path=None) -> dict:
         print("  No palace found.\n")
         return {"status": "unknown", "message": "no palace at path"}
 
-    drawers = hnsw_capacity_status(palace_path, "mempalace_drawers")
-    closets = hnsw_capacity_status(palace_path, "mempalace_closets")
+    drawers = hnsw_capacity_status(palace_path, collection_name)
+    closets = hnsw_capacity_status(palace_path, CLOSETS_COLLECTION_NAME)
 
     for label, info in (("drawers", drawers), ("closets", closets)):
         print(f"\n  [{label}]")
diff --git a/mempalace/searcher.py b/mempalace/searcher.py
index b318d99..f644fda 100644
--- a/mempalace/searcher.py
+++ b/mempalace/searcher.py
@@ -382,6 +382,7 @@ def _bm25_only_via_sqlite(
     n_results: int = 5,
     max_candidates: int = 500,
     _include_internal: bool = False,
+    collection_name: str = None,
 ) -> dict:
     """BM25-only search reading drawers directly from chroma.sqlite3.
 
@@ -405,6 +406,10 @@ def _bm25_only_via_sqlite(
             "error": "No palace found",
             "hint": "Run: mempalace init <dir> && mempalace mine <dir>",
         }
+    if collection_name is None:
+        from .config import get_configured_collection_name
+
+        collection_name = get_configured_collection_name()
 
     def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
         clauses = []
@@ -441,35 +446,43 @@ def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
         # shorter than 3 chars (trigram tokenizer can't match them).
         tokens = [t for t in _tokenize(query) if len(t) >= 3]
         candidate_ids: list[int] = []
+        use_recency_fallback = not tokens
         if tokens:
             fts_query = " OR ".join(tokens)
             filter_sql, filter_params = _metadata_filter_sql("embedding_fulltext_search.rowid")
             try:
                 rows = conn.execute(
                     f"""
-                    SELECT rowid
+                    SELECT embedding_fulltext_search.rowid
                     FROM embedding_fulltext_search
+                    JOIN embeddings e ON e.id = embedding_fulltext_search.rowid
+                    JOIN segments s ON e.segment_id = s.id
+                    JOIN collections c ON s.collection = c.id
                     WHERE embedding_fulltext_search MATCH ?
+                      AND c.name = ?
                     {filter_sql}
                     LIMIT ?
                     """,
-                    (fts_query, *filter_params, max_candidates),
+                    (fts_query, collection_name, *filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
                 # FTS5 tokenizer mismatch or syntax error — fall through
                 # to the recency-window selector below.
                 logger.debug("FTS5 MATCH failed; using recency fallback", exc_info=True)
-
-        if not candidate_ids:
-            # No FTS hits (or no usable tokens) — pull the most recent
-            # rows for the drawers segment so we can BM25-rank something
-            # rather than return empty-handed. Wrapped in try/except
-            # because the schema may differ on legacy palaces (older
-            # chromadb without ``created_at``, missing ``segments``
-            # rows after partial restore, etc.); on schema mismatch we
-            # fall back to ordering by primary-key id and finally to an
-            # empty result rather than letting search raise.
+                use_recency_fallback = True
+
+        if not candidate_ids and use_recency_fallback:
+            # No usable FTS tokens, or FTS itself failed — pull the most
+            # recent rows for the drawers segment so we can BM25-rank
+            # something rather than return empty-handed. A clean FTS miss
+            # must stay empty, especially after wing/room filtering, because
+            # recency fallback would return unrelated scoped drawers.
+            # Wrapped in try/except because the schema may differ on legacy
+            # palaces (older chromadb without ``created_at``, missing
+            # ``segments`` rows after partial restore, etc.); on schema
+            # mismatch we fall back to ordering by primary-key id and finally
+            # to an empty result rather than letting search raise.
             try:
                 filter_sql, filter_params = _metadata_filter_sql("e.id")
                 rows = conn.execute(
@@ -478,12 +491,12 @@ def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
                     FROM embeddings e
                     JOIN segments s ON e.segment_id = s.id
                     JOIN collections c ON s.collection = c.id
-                    WHERE c.name = 'mempalace_drawers'
+                    WHERE c.name = ?
                     {filter_sql}
                     ORDER BY e.created_at DESC
                     LIMIT ?
                     """,
-                    (*filter_params, max_candidates),
+                    (collection_name, *filter_params, max_candidates),
                 ).fetchall()
                 candidate_ids = [r[0] for r in rows]
             except sqlite3.Error:
@@ -499,12 +512,12 @@ def _metadata_filter_sql(row_id_expr: str) -> tuple[str, list[str]]:
                         FROM embeddings e
                         JOIN segments s ON e.segment_id = s.id
                         JOIN collections c ON s.collection = c.id
-                        WHERE c.name = 'mempalace_drawers'
+                        WHERE c.name = ?
                         {filter_sql}
                         ORDER BY e.id DESC
                         LIMIT ?
                         """,
-                        (*filter_params, max_candidates),
+                        (collection_name, *filter_params, max_candidates),
                     ).fetchall()
                     candidate_ids = [r[0] for r in rows]
                 except sqlite3.Error:
@@ -720,6 +733,7 @@ def search_memories(
     max_distance: float = 0.0,
     vector_disabled: bool = False,
     candidate_strategy: str = "vector",
+    collection_name: str = None,
 ) -> dict:
     """Programmatic search — returns a dict instead of printing.
 
@@ -770,10 +784,11 @@ def search_memories(
             wing=wing,
             room=room,
             n_results=n_results,
+            collection_name=collection_name,
         )
 
     try:
-        drawers_col = get_collection(palace_path, create=False)
+        drawers_col = get_collection(palace_path, collection_name=collection_name, create=False)
     except Exception as e:
         logger.error("No palace found at %s: %s", palace_path, e)
         return {
diff --git a/tests/test_backends.py b/tests/test_backends.py
index 4cd9480..06625fc 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -1194,3 +1194,26 @@ class DummyClient:
         ("invalid", str(palace)),
         ("stale", str(palace)),
     ]
+
+
+def test_palace_get_collection_uses_configured_collection_name(monkeypatch):
+    from mempalace import palace
+
+    captured = {}
+
+    def fake_get_collection(palace_path, collection_name=None, create=False):
+        captured["palace_path"] = palace_path
+        captured["collection_name"] = collection_name
+        captured["create"] = create
+        return object()
+
+    monkeypatch.setattr(palace._DEFAULT_BACKEND, "get_collection", fake_get_collection)
+    monkeypatch.setattr("mempalace.config.get_configured_collection_name", lambda: "custom_drawers")
+
+    palace.get_collection("/palace", create=False)
+
+    assert captured == {
+        "palace_path": "/palace",
+        "collection_name": "custom_drawers",
+        "create": False,
+    }
diff --git a/tests/test_cli.py b/tests/test_cli.py
index de00664..0b61b0c 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -776,6 +776,7 @@ def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
     mock_backend = MagicMock()
     mock_backend.get_collection.side_effect = Exception("corrupt db")
@@ -791,6 +792,7 @@ def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
     mock_col = MagicMock()
     mock_col.count.return_value = 0
@@ -807,6 +809,7 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None, yes=True)
     mock_col = MagicMock()
     mock_col.count.return_value = 2
@@ -836,12 +839,52 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
     mock_new_col.add.assert_not_called()
 
 
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys):
+    palace_dir = tmp_path / "palace"
+    palace_dir.mkdir()
+    (palace_dir / "chroma.sqlite3").write_text("db")
+    mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "custom_drawers"
+    args = argparse.Namespace(palace=None, yes=True)
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _mock_backend_for(col=mock_col, new_col=mock_new_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with patch("mempalace.backends.chroma.ChromaBackend", return_value=mock_backend):
+        cmd_repair(args)
+
+    out = capsys.readouterr().out
+    assert "Repair complete" in out
+    mock_backend.get_collection.assert_called_once_with(str(palace_dir), "custom_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+        call(str(palace_dir), "custom_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+        call(str(palace_dir), "custom_drawers"),
+        call(str(palace_dir), "custom_drawers__repair_tmp"),
+    ]
+
+
 @patch("mempalace.cli.MempalaceConfig")
 def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None, yes=True)
     mock_col = MagicMock()
     mock_col.count.return_value = 2
@@ -875,6 +918,7 @@ def test_cmd_repair_aborts_without_confirmation(mock_config_cls, tmp_path, capsy
     palace_dir.mkdir()
     (palace_dir / "chroma.sqlite3").write_text("db")
     mock_config_cls.return_value.palace_path = str(palace_dir)
+    mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
     mock_col = MagicMock()
     mock_col.count.return_value = 1
diff --git a/tests/test_hnsw_capacity.py b/tests/test_hnsw_capacity.py
index 912def8..53775b0 100644
--- a/tests/test_hnsw_capacity.py
+++ b/tests/test_hnsw_capacity.py
@@ -260,6 +260,7 @@ def test_mcp_probe_does_not_disable_vectors_for_unflushed_metadata(tmp_path, mon
 
     class _Cfg:
         palace_path = str(tmp_path)
+        collection_name = "mempalace_drawers"
 
     monkeypatch.setattr(mcp_server, "_config", _Cfg())
     monkeypatch.setattr(mcp_server, "_vector_disabled", True)
@@ -625,6 +626,7 @@ def test_tool_status_via_sqlite_returns_breakdown(palace_with_drawers, monkeypat
     # MempalaceConfig.
     class _Cfg:
         palace_path = str(palace_with_drawers)
+        collection_name = "mempalace_drawers"
 
     monkeypatch.setattr(mcp_server, "_config", _Cfg())
     monkeypatch.setattr(mcp_server, "_vector_disabled", True)
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index ae20bf3..613cc23 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -484,6 +484,26 @@ def test_add_drawer_duplicate_detection(self, monkeypatch, config, palace_path,
         assert result2["success"] is True
         assert result2["reason"] == "already_exists"
 
+    def test_add_drawer_fails_when_readback_misses(self, monkeypatch, config, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from mempalace import mcp_server
+
+        class _FakeGetResult:
+            ids = []
+
+        class _FakeCol:
+            def get(self, **kwargs):
+                return _FakeGetResult()
+
+            def upsert(self, **kwargs):
+                return None
+
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda create=False: _FakeCol())
+
+        result = mcp_server.tool_add_drawer("w", "r", "content")
+        assert result["success"] is False
+        assert "not readable" in result["error"]
+
     def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace_path, kg):
         """Documents sharing a >100-char header must get distinct IDs (full-content hash)."""
         _patch_mcp_server(monkeypatch, config, kg)
@@ -503,9 +523,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert (
-            result1["drawer_id"] != result2["drawer_id"]
-        ), "Documents with shared header but different content must have distinct drawer IDs"
+        assert result1["drawer_id"] != result2["drawer_id"], (
+            "Documents with shared header but different content must have distinct drawer IDs"
+        )
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -1158,6 +1178,25 @@ def test_reconnect_reports_success(self, monkeypatch, config, palace_path, kg):
         assert "Reconnected" in result["message"]
         assert isinstance(result["drawers"], int)
 
+    def test_reconnect_closes_shared_backend(self, monkeypatch, config, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+        from unittest.mock import MagicMock
+
+        from mempalace import mcp_server, palace
+
+        close_palace = MagicMock()
+        monkeypatch.setattr(palace._DEFAULT_BACKEND, "close_palace", close_palace)
+
+        class _FakeCol:
+            def count(self):
+                return 7
+
+        monkeypatch.setattr(mcp_server, "_get_collection", lambda create=False: _FakeCol())
+
+        result = mcp_server.tool_reconnect()
+        assert result["success"] is True
+        close_palace.assert_called_once_with(config.palace_path)
+
     def test_get_collection_create_true_avoids_get_or_create_on_reopen(
         self, monkeypatch, config, palace_path, kg
     ):
diff --git a/tests/test_repair.py b/tests/test_repair.py
index a60836a..8e9f95b 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -28,6 +28,16 @@ def test_get_palace_path_fallback():
         assert ".mempalace" in result
 
 
+def test_get_collection_name_from_config():
+    from mempalace.config import get_configured_collection_name
+
+    get_configured_collection_name.cache_clear()
+    with patch("mempalace.config.MempalaceConfig") as mock_config_cls:
+        mock_config_cls.return_value.collection_name = "custom_drawers"
+        assert repair._drawers_collection_name() == "custom_drawers"
+    get_configured_collection_name.cache_clear()
+
+
 # ── _paginate_ids ─────────────────────────────────────────────────────
 
 
@@ -330,6 +340,21 @@ def test_check_extraction_safety_passes_when_counts_match(tmp_path):
         repair.check_extraction_safety(str(tmp_path), 500)
 
 
+def test_check_extraction_safety_uses_configured_collection(tmp_path):
+    with patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count:
+        repair.check_extraction_safety(str(tmp_path), 500, collection_name="custom_drawers")
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+
+
+def test_check_extraction_safety_default_uses_configured_collection(tmp_path):
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.sqlite_drawer_count", return_value=500) as count,
+    ):
+        repair.check_extraction_safety(str(tmp_path), 500)
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+
+
 def test_check_extraction_safety_passes_when_sqlite_unreadable_and_under_cap(tmp_path):
     """SQLite check fails (None) but extraction is well under the cap → safe."""
     with patch("mempalace.repair.sqlite_drawer_count", return_value=None):
@@ -384,6 +409,73 @@ def test_sqlite_drawer_count_returns_none_on_unreadable_schema(tmp_path):
     assert repair.sqlite_drawer_count(str(tmp_path)) is None
 
 
+@patch("mempalace.repair.shutil")
+@patch("mempalace.repair.ChromaBackend")
+def test_rebuild_index_default_uses_configured_collection(mock_backend_cls, mock_shutil, tmp_path):
+    sqlite_path = tmp_path / "chroma.sqlite3"
+    sqlite_path.write_text("fake")
+    mock_col = MagicMock()
+    mock_col.count.return_value = 2
+    mock_col.get.return_value = {
+        "ids": ["id1", "id2"],
+        "documents": ["doc1", "doc2"],
+        "metadatas": [{"wing": "a"}, {"wing": "b"}],
+    }
+    mock_temp_col = MagicMock()
+    mock_temp_col.count.return_value = 2
+    mock_new_col = MagicMock()
+    mock_new_col.count.return_value = 2
+    mock_backend = _install_mock_backend(mock_backend_cls, mock_col)
+    mock_backend.create_collection.side_effect = [mock_temp_col, mock_new_col]
+
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.sqlite_drawer_count", return_value=2) as count,
+    ):
+        repair.rebuild_index(palace_path=str(tmp_path))
+
+    mock_backend.get_collection.assert_called_once_with(str(tmp_path), "custom_drawers")
+    count.assert_called_once_with(str(tmp_path), "custom_drawers")
+    assert mock_backend.create_collection.call_args_list == [
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+        call(str(tmp_path), "custom_drawers"),
+    ]
+    assert mock_backend.delete_collection.call_args_list == [
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+        call(str(tmp_path), "custom_drawers"),
+        call(str(tmp_path), "custom_drawers__repair_tmp"),
+    ]
+
+
+def test_status_default_uses_configured_drawer_collection(tmp_path):
+    with (
+        patch("mempalace.repair._drawers_collection_name", return_value="custom_drawers"),
+        patch("mempalace.repair.hnsw_capacity_status") as capacity_status,
+    ):
+        capacity_status.side_effect = [
+            {
+                "sqlite_count": 1,
+                "hnsw_count": 1,
+                "divergence": 0,
+                "diverged": False,
+                "status": "ok",
+                "message": "",
+            },
+            {
+                "sqlite_count": 0,
+                "hnsw_count": 0,
+                "divergence": 0,
+                "diverged": False,
+                "status": "ok",
+                "message": "",
+            },
+        ]
+        repair.status(palace_path=str(tmp_path))
+
+    assert capacity_status.call_args_list[0].args == (str(tmp_path), "custom_drawers")
+    assert capacity_status.call_args_list[1].args == (str(tmp_path), "mempalace_closets")
+
+
 @patch("mempalace.repair.shutil")
 @patch("mempalace.repair.ChromaBackend")
 def test_rebuild_index_aborts_on_truncation_signal(mock_backend_cls, mock_shutil, tmp_path):
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 4f0b4c0..60bef9f 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -84,6 +84,24 @@ def test_search_memories_query_error(self):
         assert "error" in result
         assert "query failed" in result["error"]
 
+    def test_search_memories_vector_path_uses_explicit_collection_name(self):
+        mock_col = MagicMock()
+        mock_col.query.return_value = {
+            "documents": [[]],
+            "metadatas": [[]],
+            "distances": [[]],
+            "ids": [[]],
+        }
+
+        with patch("mempalace.searcher.get_collection", return_value=mock_col) as get_collection:
+            search_memories("test", "/fake/path", collection_name="custom_drawers")
+
+        get_collection.assert_called_once_with(
+            "/fake/path",
+            collection_name="custom_drawers",
+            create=False,
+        )
+
     def test_search_memories_filters_in_result(self, palace_path, seeded_collection):
         result = search_memories("test", palace_path, wing="project", room="backend")
         assert result["filters"]["wing"] == "project"
@@ -102,7 +120,7 @@ def test_search_memories_handles_none_metadata(self):
             "ids": [["d1", "d2"]],
         }
 
-        def mock_get_collection(path, create=False):
+        def mock_get_collection(path, collection_name=None, create=False):
             # First call: drawers. Second call: closets — raise so hybrid
             # degrades to pure drawer search (the catch block covers it).
             if not hasattr(mock_get_collection, "_called"):
@@ -309,9 +327,9 @@ def test_search_applies_bm25_hybrid_rerank(self, capsys):
         captured = capsys.readouterr()
         first_block, _, _ = captured.out.partition("[2]")
         # Lexical match must rank first
-        assert (
-            "b.md" in first_block
-        ), f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
+        assert "b.md" in first_block, (
+            f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
+        )
         # Non-zero bm25 reported
         assert "bm25=" in first_block
         assert "bm25=0.0" not in first_block

From e9aee194335a446787f2e2a007ff28e11269f385 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 09:10:22 -0300
Subject: [PATCH 086/127] fix(tests): apply ruff format after rebase resolution

The collection_name plumbing rebase produced a few unformatted blocks
in test_mcp_server.py and test_searcher.py; bringing them in line with
the 0.4.x CI pin so test-windows / lint stay green.
---
 tests/test_mcp_server.py | 6 +++---
 tests/test_searcher.py   | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 613cc23..1f47192 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -523,9 +523,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert result1["drawer_id"] != result2["drawer_id"], (
-            "Documents with shared header but different content must have distinct drawer IDs"
-        )
+        assert (
+            result1["drawer_id"] != result2["drawer_id"]
+        ), "Documents with shared header but different content must have distinct drawer IDs"
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 60bef9f..f4d46a0 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -327,9 +327,9 @@ def test_search_applies_bm25_hybrid_rerank(self, capsys):
         captured = capsys.readouterr()
         first_block, _, _ = captured.out.partition("[2]")
         # Lexical match must rank first
-        assert "b.md" in first_block, (
-            f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
-        )
+        assert (
+            "b.md" in first_block
+        ), f"expected lexical match 'b.md' at rank 1, got:\n{captured.out}"
         # Non-zero bm25 reported
         assert "bm25=" in first_block
         assert "bm25=0.0" not in first_block

From 5134a635ed5cfe819ccc479f58b256bd3e19af45 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 11:52:58 -0300
Subject: [PATCH 087/127] fix(repair): run SQLite integrity preflight before
 chromadb open
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#1364 added the SQLite quick_check preflight to rebuild_index, but
placed it AFTER backend.get_collection(...). On a SQLite-corrupt
palace, chromadb's rust binding raises pyo3_runtime.PanicException —
which is not a regular Exception subclass — so it propagates past the
existing `except Exception` handlers and the user sees a 30-line stack
trace instead of the friendly abort message #1364 was designed to
deliver. Reproduced with `mempalace repair --yes` against a palace
whose chroma.sqlite3 has 4 mangled pages: pre-fix, panic; post-fix,
the clean abort message and exit code 1.

Two changes:

- mempalace/cli.py cmd_repair: run sqlite_integrity_errors() right
  after the basic palace-existence check, BEFORE the max_seq_id
  preflight (which itself opens sqlite3) and BEFORE backend =
  ChromaBackend(). Exit non-zero so unattended scripts and CI gates
  see the failure.

- mempalace/repair.py rebuild_index: same move at the function level
  for direct callers (tests, MCP) that bypass cmd_repair.

The new test test_rebuild_index_runs_sqlite_preflight_before_chromadb_open
uses a real chromadb-built palace (no ChromaBackend mock) plus a
real corrupt SQLite (16 KB of mangled pages) so the ordering is
exercised end-to-end. The previously-shipping test for the abort path
mocked both the backend and sqlite_integrity_errors, which is why the
ordering bug shipped CI-green.

Six existing test_cli.py cmd_repair tests used `(palace_dir /
"chroma.sqlite3").write_text("db")` to fake the SQLite file. The new
preflight correctly fails quick_check on those 2-byte stubs, so the
tests now create empty real SQLite DBs the same way the test_repair.py
fixtures already do.
---
 mempalace/cli.py     | 14 ++++++++++++
 mempalace/repair.py  | 16 +++++++++-----
 tests/test_cli.py    | 13 +++++------
 tests/test_repair.py | 51 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 11 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 14db5a8..6a531e7 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -662,6 +662,8 @@ def cmd_repair(args):
         _rebuild_collection_via_temp,
         check_extraction_safety,
         maybe_repair_poisoned_max_seq_id_before_rebuild,
+        print_sqlite_integrity_abort,
+        sqlite_integrity_errors,
     )
 
     config = MempalaceConfig()
@@ -743,6 +745,18 @@ def cmd_repair(args):
         print(f"\n No palace database found at {db_path}")
         return
 
+    # Run the SQLite integrity preflight before any chromadb client open.
+    # ChromaDB's rust binding raises pyo3_runtime.PanicException on a
+    # malformed page, which is not a regular Exception subclass and
+    # propagates past the try/except below — the user gets a 30-line
+    # stack trace instead of the friendly abort message. Run quick_check
+    # here so we can surface the clear recovery instructions and exit
+    # cleanly before chromadb's compactor touches the disk.
+    sqlite_errors = sqlite_integrity_errors(palace_path)
+    if sqlite_errors:
+        print_sqlite_integrity_abort(palace_path, sqlite_errors)
+        sys.exit(1)
+
     preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
         palace_path,
         backup=getattr(args, "backup", True),
diff --git a/mempalace/repair.py b/mempalace/repair.py
index 6e170ef..dd4c46a 100644
--- a/mempalace/repair.py
+++ b/mempalace/repair.py
@@ -633,6 +633,17 @@ def rebuild_index(
     print(f"{'=' * 55}\n")
     print(f" Palace: {palace_path}")
 
+    # Run the SQLite integrity preflight before any chromadb client open.
+    # ChromaDB's rust binding raises pyo3_runtime.PanicException (which is
+    # not a regular Exception subclass) on a malformed page, propagating
+    # past the try/except around get_collection below. Catching the
+    # corruption here lets us surface the clear recovery instructions and
+    # exit cleanly before chromadb's compactor touches the disk.
+    sqlite_errors = sqlite_integrity_errors(palace_path)
+    if sqlite_errors:
+        print_sqlite_integrity_abort(palace_path, sqlite_errors)
+        return
+
     preflight = maybe_repair_poisoned_max_seq_id_before_rebuild(
         palace_path,
         assume_yes=True,
@@ -676,11 +687,6 @@ def rebuild_index(
         print(e.message)
         return
 
-    sqlite_errors = sqlite_integrity_errors(palace_path)
-    if sqlite_errors:
-        print_sqlite_integrity_abort(palace_path, sqlite_errors)
-        return
-
     # Back up ONLY the SQLite database, not the bloated HNSW files
     sqlite_path = os.path.join(palace_path, "chroma.sqlite3")
     backup_path = sqlite_path + ".backup"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 0b61b0c..fa5680d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2,6 +2,7 @@
 
 import argparse
 import shlex
+import sqlite3
 import sys
 from pathlib import Path
 from unittest.mock import MagicMock, call, patch
@@ -774,7 +775,7 @@ def test_cmd_repair_requires_palace_database(mock_config_cls, tmp_path, capsys):
 def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
-    (palace_dir / "chroma.sqlite3").write_text("db")
+    sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
     mock_config_cls.return_value.palace_path = str(palace_dir)
     mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
@@ -790,7 +791,7 @@ def test_cmd_repair_error_reading(mock_config_cls, tmp_path, capsys):
 def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
-    (palace_dir / "chroma.sqlite3").write_text("db")
+    sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
     mock_config_cls.return_value.palace_path = str(palace_dir)
     mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
@@ -807,7 +808,7 @@ def test_cmd_repair_zero_drawers(mock_config_cls, tmp_path, capsys):
 def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
-    (palace_dir / "chroma.sqlite3").write_text("db")
+    sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
     mock_config_cls.return_value.palace_path = str(palace_dir)
     mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None, yes=True)
@@ -843,7 +844,7 @@ def test_cmd_repair_success(mock_config_cls, tmp_path, capsys):
 def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
-    (palace_dir / "chroma.sqlite3").write_text("db")
+    sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
     mock_config_cls.return_value.palace_path = str(palace_dir)
     mock_config_cls.return_value.collection_name = "custom_drawers"
     args = argparse.Namespace(palace=None, yes=True)
@@ -882,7 +883,7 @@ def test_cmd_repair_uses_configured_collection(mock_config_cls, tmp_path, capsys
 def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
-    (palace_dir / "chroma.sqlite3").write_text("db")
+    sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
     mock_config_cls.return_value.palace_path = str(palace_dir)
     mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None, yes=True)
@@ -916,7 +917,7 @@ def test_cmd_repair_restores_backup_on_live_rebuild_failure(mock_config_cls, tmp
 def test_cmd_repair_aborts_without_confirmation(mock_config_cls, tmp_path, capsys):
     palace_dir = tmp_path / "palace"
     palace_dir.mkdir()
-    (palace_dir / "chroma.sqlite3").write_text("db")
+    sqlite3.connect(str(palace_dir / "chroma.sqlite3")).close()
     mock_config_cls.return_value.palace_path = str(palace_dir)
     mock_config_cls.return_value.collection_name = "mempalace_drawers"
     args = argparse.Namespace(palace=None)
diff --git a/tests/test_repair.py b/tests/test_repair.py
index dda83ec..264561f 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -1153,6 +1153,57 @@ def test_rebuild_index_aborts_on_sqlite_integrity_errors_before_delete_collectio
     mock_shutil.copy2.assert_not_called()
 
 
+def test_rebuild_index_runs_sqlite_preflight_before_chromadb_open(tmp_path, capsys):
+    """The SQLite integrity preflight must run BEFORE backend.get_collection.
+
+    chromadb's rust binding raises pyo3_runtime.PanicException (which is not
+    a regular Exception subclass) on a malformed page, so any get_collection
+    call against a corrupt SQLite propagates past `except Exception` handlers
+    and produces a 30-line stack trace instead of the friendly abort message.
+    Regression test for the ordering bug where the preflight was placed after
+    the chromadb client open and therefore never reached on the cases it was
+    designed to catch (#1364 follow-up).
+    """
+    palace = tmp_path / "palace"
+    palace.mkdir()
+
+    # Build a real chromadb palace with one drawer so chroma.sqlite3 exists
+    # at full schema size, then mangle several middle pages so PRAGMA
+    # quick_check fails with "disk image is malformed". This matches the
+    # production failure mode users hit in #1362 / #1364.
+    from mempalace.backends.chroma import ChromaBackend
+
+    backend = ChromaBackend()
+    try:
+        col = backend.create_collection(str(palace), "mempalace_drawers")
+        col.upsert(
+            ids=["d1"],
+            documents=["doc"],
+            metadatas=[{"wing": "w", "room": "r"}],
+        )
+    finally:
+        backend.close()
+
+    sqlite_path = palace / "chroma.sqlite3"
+    pre_size = sqlite_path.stat().st_size
+    assert pre_size > 16384, "need a multi-page sqlite db to mangle"
+
+    with open(sqlite_path, "r+b") as f:
+        f.seek(40960)  # page 10
+        f.write(b"\xde\xad\xbe\xef" * 4096)  # 16 KB of garbage
+
+    # No chromadb mocks: rebuild_index must reach sqlite_integrity_errors
+    # before any code path that opens a chromadb client. If the preflight
+    # comes too late, the test fails with pyo3_runtime.PanicException
+    # instead of returning cleanly.
+    repair.rebuild_index(palace_path=str(palace))
+
+    out = capsys.readouterr().out
+    assert "SQLite-layer corruption detected before repair rebuild" in out
+    assert "PRAGMA quick_check" in out
+    assert "disk image is malformed" in out
+
+
 def test_max_seq_id_preflight_preserves_embeddings_queue(tmp_path):
     """#1295: default repair preflight must not drop queued writes."""
 

From 7b151039c9479b15df9c27ada667e7f919acbe62 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 12:07:54 -0300
Subject: [PATCH 088/127] test(repair): page-align corruption offset in
 preflight regression test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address Copilot review on #1403: the test seeked unconditionally to
offset 40960 with only `pre_size > 16384` as a guard. If pre_size sat
between 16384 and 40960 + 16384 = 57344 (e.g., on a chromadb version
that allocated fewer pages on init, or a future schema change), the
seek would extend the file with zero-padding and the original pages
would stay intact — quick_check would still pass on the (untouched)
real data, and the regression guard would silently skip detecting a
preflight-ordering regression.

Compute the offset from pre_size, page-aligned, with explicit asserts
that the file is large enough to mangle 4 pages without truncating
the header or extending past EOF.
---
 tests/test_repair.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/tests/test_repair.py b/tests/test_repair.py
index 264561f..37651ba 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -1186,11 +1186,31 @@ def test_rebuild_index_runs_sqlite_preflight_before_chromadb_open(tmp_path, caps
 
     sqlite_path = palace / "chroma.sqlite3"
     pre_size = sqlite_path.stat().st_size
-    assert pre_size > 16384, "need a multi-page sqlite db to mangle"
+
+    # Compute a page-aligned corruption offset that's always inside the
+    # existing file. SQLite uses 4 KB pages by default; we mangle 4 pages
+    # somewhere in the middle, skipping at least the first 2 pages
+    # (header + root) so the file still opens. Without clamping to the
+    # actual file size, a seek past EOF on r+b mode would silently
+    # extend the file with zero-padding and leave the original pages
+    # intact — quick_check would still pass, and the regression guard
+    # would skip the bug.
+    PAGE = 4096
+    CORRUPT_BYTES = 16384  # 4 pages
+    HEADER_GUARD = PAGE * 2  # leave header + root pages intact
+    assert (
+        pre_size >= HEADER_GUARD + CORRUPT_BYTES
+    ), f"sqlite db too small to mangle without truncating: {pre_size} bytes"
+    # Round (pre_size - CORRUPT_BYTES) down to a page boundary so we
+    # mangle whole pages. Cap at offset 40960 (page 10) for stable
+    # diagnostics across SQLite versions that may grow the file.
+    max_offset = (pre_size - CORRUPT_BYTES) & ~(PAGE - 1)
+    corrupt_offset = min(40960, max_offset)
+    assert corrupt_offset >= HEADER_GUARD, f"corruption offset {corrupt_offset} too close to header"
 
     with open(sqlite_path, "r+b") as f:
-        f.seek(40960)  # page 10
-        f.write(b"\xde\xad\xbe\xef" * 4096)  # 16 KB of garbage
+        f.seek(corrupt_offset)
+        f.write(b"\xde\xad\xbe\xef" * (CORRUPT_BYTES // 4))
 
     # No chromadb mocks: rebuild_index must reach sqlite_integrity_errors
     # before any code path that opens a chromadb client. If the preflight

From 2a0ed0cb8f8bf7458be7ce5dc494b62ba0c39510 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 12:38:39 -0300
Subject: [PATCH 089/127] fix(closet_llm): retry _call_llm on JSONDecodeError
 instead of bailing

The retry loop already backs off on HTTP 429/503 and rate-limit-shaped
exceptions, but JSONDecodeError exited on the first failure. Local LLM
runtimes occasionally produce malformed JSON (truncated streams, partial
chunks under load), and the retry was effectively dead for that path.

Mirror the 429/503 branch: sleep with exponential backoff and continue
through all 3 attempts, only returning None after the final failure.

Closes #1155
---
 mempalace/closet_llm.py  |  3 +++
 tests/test_closet_llm.py | 26 +++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/mempalace/closet_llm.py b/mempalace/closet_llm.py
index 50000c8..a85d517 100644
--- a/mempalace/closet_llm.py
+++ b/mempalace/closet_llm.py
@@ -169,6 +169,9 @@ def _call_llm(cfg: LLMConfig, source_file: str, wing: str, room: str, content: s
             parsed = json.loads(text)
             return parsed, payload.get("usage")
         except json.JSONDecodeError:
+            if attempt < 2:
+                time.sleep(2**attempt)
+                continue
             return None, None
         except urllib.error.HTTPError as e:
             # 429 / 503 = retry with backoff
diff --git a/tests/test_closet_llm.py b/tests/test_closet_llm.py
index 3a0e84e..0255ee8 100644
--- a/tests/test_closet_llm.py
+++ b/tests/test_closet_llm.py
@@ -196,9 +196,33 @@ def fake_urlopen(req, timeout=None):
                 }
             )
 
-        with patch("urllib.request.urlopen", side_effect=fake_urlopen):
+        with (
+            patch("urllib.request.urlopen", side_effect=fake_urlopen),
+            patch("mempalace.closet_llm.time.sleep"),
+        ):
+            parsed, usage = _call_llm(cfg, "/tmp/x", "w", "r", "c")
+        assert parsed is None
+
+    def test_retries_on_json_decode_error(self):
+        cfg = self._make_cfg()
+        call_count = {"n": 0}
+
+        def fake_urlopen(req, timeout=None):
+            call_count["n"] += 1
+            return _FakeResp(
+                {
+                    "choices": [{"message": {"content": "not json at all"}}],
+                    "usage": {"prompt_tokens": 1, "completion_tokens": 1},
+                }
+            )
+
+        with (
+            patch("urllib.request.urlopen", side_effect=fake_urlopen),
+            patch("mempalace.closet_llm.time.sleep"),
+        ):
             parsed, usage = _call_llm(cfg, "/tmp/x", "w", "r", "c")
         assert parsed is None
+        assert call_count["n"] == 3
 
 
 # ── regenerate_closets error paths ───────────────────────────────────────

From 40e2c8b056b6954629da4dc9841a7a2d44360dab Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 12:40:26 -0300
Subject: [PATCH 090/127] fix(exporter): refuse symlinks at export targets

A symlink pre-placed at the export output_dir or any wing subdirectory
would redirect markdown writes to wherever the symlink points. The
miner already rejects symlinked inputs via Path.is_symlink(); the
exporter should apply the same caution to outputs.

Add _reject_symlink() helper and call it before makedirs on both
output_dir and each wing_dir. Refusal raises ValueError with a clear
message rather than silently falling through.

Closes #1156
---
 mempalace/exporter.py  | 16 ++++++++++++++++
 tests/test_exporter.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/mempalace/exporter.py b/mempalace/exporter.py
index 4b903b0..a19181b 100644
--- a/mempalace/exporter.py
+++ b/mempalace/exporter.py
@@ -26,6 +26,20 @@ def _safe_path_component(name: str) -> str:
     return name or "unknown"
 
 
+def _reject_symlink(path: str, label: str) -> None:
+    """Refuse to write into a path that is itself a symlink.
+
+    Defense-in-depth: a pre-placed symlink at the export target would
+    redirect writes to wherever it points (e.g., system directories).
+    Mirrors the miner's input-side caution.
+    """
+    if os.path.islink(path):
+        raise ValueError(
+            f"refusing to export: {label} is a symbolic link ({path!r}). "
+            f"Remove the symlink or choose a different output path."
+        )
+
+
 def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -> dict:
     """Export all palace drawers as markdown files organized by wing/room.
 
@@ -48,6 +62,7 @@ def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -
         print("  Palace is empty — nothing to export.")
         return {"wings": 0, "rooms": 0, "drawers": 0}
 
+    _reject_symlink(output_dir, "output_dir")
     os.makedirs(output_dir, exist_ok=True)
     try:
         os.chmod(output_dir, 0o700)
@@ -89,6 +104,7 @@ def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -
             safe_wing = _safe_path_component(wing)
             wing_dir = os.path.join(output_dir, safe_wing)
             if wing_dir not in created_wing_dirs:
+                _reject_symlink(wing_dir, f"wing directory {safe_wing!r}")
                 os.makedirs(wing_dir, exist_ok=True)
                 try:
                     os.chmod(wing_dir, 0o700)
diff --git a/tests/test_exporter.py b/tests/test_exporter.py
index 0597ec1..e4d4ee7 100644
--- a/tests/test_exporter.py
+++ b/tests/test_exporter.py
@@ -134,3 +134,45 @@ def test_export_empty_palace():
         assert stats == {"wings": 0, "rooms": 0, "drawers": 0}
     finally:
         shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_export_refuses_symlinked_output_dir():
+    """A symlink at the output path must not be followed (defense-in-depth)."""
+    import pytest
+
+    tmpdir = tempfile.mkdtemp()
+    try:
+        palace_path = _setup_palace(tmpdir)
+        decoy_target = os.path.join(tmpdir, "decoy_target")
+        os.makedirs(decoy_target)
+        output_dir = os.path.join(tmpdir, "export")
+        os.symlink(decoy_target, output_dir)
+
+        with pytest.raises(ValueError, match="symbolic link"):
+            export_palace(palace_path, output_dir)
+
+        # Decoy target must remain empty — nothing followed the symlink.
+        assert os.listdir(decoy_target) == []
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_export_refuses_symlinked_wing_dir():
+    """A symlink pre-placed at a wing subdirectory must also be refused."""
+    import pytest
+
+    tmpdir = tempfile.mkdtemp()
+    try:
+        palace_path = _setup_palace(tmpdir)
+        decoy_target = os.path.join(tmpdir, "decoy_target")
+        os.makedirs(decoy_target)
+        output_dir = os.path.join(tmpdir, "export")
+        os.makedirs(output_dir)
+        os.symlink(decoy_target, os.path.join(output_dir, "alpha"))
+
+        with pytest.raises(ValueError, match="symbolic link"):
+            export_palace(palace_path, output_dir)
+
+        assert os.listdir(decoy_target) == []
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)

From 0d1c1fbcaab751a1e7f8d992debaf0c5976904de Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 12:42:02 -0300
Subject: [PATCH 091/127] fix(diary): detect same-size edits via content hash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The skip-if-unchanged check compared byte length only, so any in-place
edit preserving total length (typo fix "teh"→"the", word swap) was
silently dropped — a verbatim-storage violation: the user's actual
words never reached the palace.

Switch the gate to sha256(text). State entries gain a "content_hash"
field; the legacy size-only path is preserved when prev_hash is missing
so a post-upgrade run does not re-ingest every untouched diary.

Closes #925
---
 mempalace/diary_ingest.py | 19 +++++++++++++++----
 tests/test_closets.py     | 20 ++++++++++++++++++++
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/mempalace/diary_ingest.py b/mempalace/diary_ingest.py
index 503f0c0..7939dda 100644
--- a/mempalace/diary_ingest.py
+++ b/mempalace/diary_ingest.py
@@ -120,12 +120,22 @@ def ingest_diaries(
             continue
         date_str = date_match.group(1)
 
-        # Skip if content hasn't changed
+        # Skip if content hasn't changed. Hash-based — size alone false-negatives
+        # on same-length edits (e.g. "teh" → "the"), silently dropping real edits.
         state_key = f"{wing}|{diary_path.name}"
-        prev_size = state.get(state_key, {}).get("size", 0)
+        prev_entry = state.get(state_key, {})
+        prev_hash = prev_entry.get("content_hash")
+        prev_size = prev_entry.get("size", 0)
         curr_size = len(text)
-        if curr_size == prev_size and not force:
-            continue
+        curr_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
+        if not force:
+            if prev_hash is not None:
+                if curr_hash == prev_hash:
+                    continue
+            elif curr_size == prev_size and prev_size > 0:
+                # Legacy state without content_hash: keep size-based skip so a
+                # post-upgrade run doesn't re-ingest every untouched diary.
+                continue
 
         now_iso = datetime.now(timezone.utc).isoformat()
         drawer_id = _diary_drawer_id(wing, date_str)
@@ -184,6 +194,7 @@ def ingest_diaries(
 
             state[state_key] = {
                 "size": curr_size,
+                "content_hash": curr_hash,
                 "entry_count": len(entries),
                 "ingested_at": now_iso,
             }
diff --git a/tests/test_closets.py b/tests/test_closets.py
index 976086d..bd996e1 100644
--- a/tests/test_closets.py
+++ b/tests/test_closets.py
@@ -605,6 +605,26 @@ def test_ingest_skips_unchanged_on_second_run(self, tmp_path):
         result = ingest_diaries(str(diary_dir), str(palace_dir))
         assert result["days_updated"] == 0
 
+    def test_ingest_detects_same_size_content_edit(self, tmp_path):
+        # Regression #925: the prior skip-check compared byte length only, so
+        # any in-place edit preserving total length (e.g. typo fix "teh"→"the")
+        # was silently dropped. Content-hash check must catch it.
+        diary_dir = tmp_path / "diaries"
+        diary_dir.mkdir()
+        diary_file = diary_dir / "2026-04-13.md"
+        original = "# 2026-04-13\n\n## 10:00 — Test\n\nThe quick brown fox jumps over.\n"
+        edited = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh quick brown fox jumps over.\n"
+        assert len(original) == len(edited), "test setup: edited content must be same length"
+        diary_file.write_text(original)
+        palace_dir = tmp_path / "palace"
+
+        from mempalace.diary_ingest import ingest_diaries
+
+        ingest_diaries(str(diary_dir), str(palace_dir), force=True)
+        diary_file.write_text(edited)
+        result = ingest_diaries(str(diary_dir), str(palace_dir))
+        assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest"
+
     def test_state_file_lives_outside_diary_dir(self, tmp_path):
         # Regression: the original implementation wrote
         # ``.diary_ingest_state.json`` *inside* the user's diary directory,

From 8e21b5abd48a97830bce4fb3c28584d6e9b2caa5 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 12:49:27 -0300
Subject: [PATCH 092/127] test(closet_llm): use _ for unused return values per
 copilot review

---
 tests/test_closet_llm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_closet_llm.py b/tests/test_closet_llm.py
index 0255ee8..905d28b 100644
--- a/tests/test_closet_llm.py
+++ b/tests/test_closet_llm.py
@@ -200,7 +200,7 @@ def fake_urlopen(req, timeout=None):
             patch("urllib.request.urlopen", side_effect=fake_urlopen),
             patch("mempalace.closet_llm.time.sleep"),
         ):
-            parsed, usage = _call_llm(cfg, "/tmp/x", "w", "r", "c")
+            parsed, _ = _call_llm(cfg, "/tmp/x", "w", "r", "c")
         assert parsed is None
 
     def test_retries_on_json_decode_error(self):
@@ -220,7 +220,7 @@ def fake_urlopen(req, timeout=None):
             patch("urllib.request.urlopen", side_effect=fake_urlopen),
             patch("mempalace.closet_llm.time.sleep"),
         ):
-            parsed, usage = _call_llm(cfg, "/tmp/x", "w", "r", "c")
+            parsed, _ = _call_llm(cfg, "/tmp/x", "w", "r", "c")
         assert parsed is None
         assert call_count["n"] == 3
 

From 75452380a896959b79a25eb99061db8c7d95a001 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 12:51:47 -0300
Subject: [PATCH 093/127] fix(exporter): refuse symlinks at file targets and
 skip tests on Windows

Address Copilot review on #1156:

- Per-file symlink check via new _safe_open_for_write() helper. Uses
  O_NOFOLLOW on POSIX (close TOCTOU window between islink check and
  open) and falls back to islink + open on Windows. Applied to room
  files and index.md, mirroring the existing dir-level check.
- Tests now wrap os.symlink() in _try_symlink_or_skip() so Windows
  without Developer Mode and restricted CI sandboxes skip rather than
  hard-fail. Added two regression tests for the file-level cases
  (room file, index.md).
---
 mempalace/exporter.py  | 29 +++++++++++++++++--
 tests/test_exporter.py | 63 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/mempalace/exporter.py b/mempalace/exporter.py
index a19181b..2b874c6 100644
--- a/mempalace/exporter.py
+++ b/mempalace/exporter.py
@@ -11,6 +11,7 @@
 regardless of palace size.
 """
 
+import errno
 import os
 import re
 from collections import defaultdict
@@ -40,6 +41,30 @@ def _reject_symlink(path: str, label: str) -> None:
         )
 
 
+def _safe_open_for_write(path: str, mode: str, encoding: str = "utf-8"):
+    """Open a file for writing, refusing to follow a symlink at the target path.
+
+    On POSIX (O_NOFOLLOW available) the open itself fails with ELOOP if path is
+    a symlink — closing the TOCTOU window between an islink check and the open.
+    On platforms without O_NOFOLLOW (Windows), pre-checks ``os.path.islink``,
+    which is narrower than no check at all.
+    """
+    o_nofollow = getattr(os, "O_NOFOLLOW", 0)
+    if o_nofollow:
+        flags = os.O_WRONLY | os.O_CREAT | o_nofollow
+        flags |= os.O_APPEND if "a" in mode else os.O_TRUNC
+        try:
+            fd = os.open(path, flags, 0o600)
+        except OSError as e:
+            if e.errno == errno.ELOOP:
+                raise ValueError(f"refusing to write: {path!r} is a symbolic link.") from None
+            raise
+        return os.fdopen(fd, mode, encoding=encoding)
+    if os.path.islink(path):
+        raise ValueError(f"refusing to write: {path!r} is a symbolic link.")
+    return open(path, mode, encoding=encoding)
+
+
 def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -> dict:
     """Export all palace drawers as markdown files organized by wing/room.
 
@@ -118,7 +143,7 @@ def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -
                 key = (wing, room)
                 is_new = key not in opened_rooms
 
-                with open(room_path, "a" if not is_new else "w", encoding="utf-8") as f:
+                with _safe_open_for_write(room_path, "a" if not is_new else "w") as f:
                     if is_new:
                         f.write(f"# {wing} / {room}\n\n")
                         opened_rooms.add(key)
@@ -168,7 +193,7 @@ def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -
     index_lines.append("")
 
     index_path = os.path.join(output_dir, "index.md")
-    with open(index_path, "w", encoding="utf-8") as f:
+    with _safe_open_for_write(index_path, "w") as f:
         f.write("\n".join(index_lines))
 
     stats = {
diff --git a/tests/test_exporter.py b/tests/test_exporter.py
index e4d4ee7..6709339 100644
--- a/tests/test_exporter.py
+++ b/tests/test_exporter.py
@@ -136,6 +136,22 @@ def test_export_empty_palace():
         shutil.rmtree(tmpdir, ignore_errors=True)
 
 
+def _try_symlink_or_skip(target: str, link: str):
+    """Create a symlink, skipping the test if the runtime forbids it.
+
+    Windows without Developer Mode/admin and some restricted CI sandboxes
+    refuse os.symlink with OSError or NotImplementedError. The exporter
+    hardening is meaningful only where symlinks can be created at all, so
+    skipping is preferable to a hard failure.
+    """
+    import pytest
+
+    try:
+        os.symlink(target, link)
+    except (OSError, NotImplementedError) as e:
+        pytest.skip(f"symlink creation not supported in this environment: {e}")
+
+
 def test_export_refuses_symlinked_output_dir():
     """A symlink at the output path must not be followed (defense-in-depth)."""
     import pytest
@@ -146,7 +162,7 @@ def test_export_refuses_symlinked_output_dir():
         decoy_target = os.path.join(tmpdir, "decoy_target")
         os.makedirs(decoy_target)
         output_dir = os.path.join(tmpdir, "export")
-        os.symlink(decoy_target, output_dir)
+        _try_symlink_or_skip(decoy_target, output_dir)
 
         with pytest.raises(ValueError, match="symbolic link"):
             export_palace(palace_path, output_dir)
@@ -168,7 +184,7 @@ def test_export_refuses_symlinked_wing_dir():
         os.makedirs(decoy_target)
         output_dir = os.path.join(tmpdir, "export")
         os.makedirs(output_dir)
-        os.symlink(decoy_target, os.path.join(output_dir, "alpha"))
+        _try_symlink_or_skip(decoy_target, os.path.join(output_dir, "alpha"))
 
         with pytest.raises(ValueError, match="symbolic link"):
             export_palace(palace_path, output_dir)
@@ -176,3 +192,46 @@ def test_export_refuses_symlinked_wing_dir():
         assert os.listdir(decoy_target) == []
     finally:
         shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_export_refuses_symlinked_room_file():
+    """A symlink pre-placed at a room file path must not be followed."""
+    import pytest
+
+    tmpdir = tempfile.mkdtemp()
+    try:
+        palace_path = _setup_palace(tmpdir)
+        decoy_target = os.path.join(tmpdir, "decoy_target.md")
+        Path(decoy_target).write_text("untouched\n", encoding="utf-8")
+        output_dir = os.path.join(tmpdir, "export")
+        os.makedirs(os.path.join(output_dir, "alpha"))
+        _try_symlink_or_skip(decoy_target, os.path.join(output_dir, "alpha", "backend.md"))
+
+        with pytest.raises(ValueError, match="symbolic link"):
+            export_palace(palace_path, output_dir)
+
+        # Decoy file must remain unchanged — open did not follow the symlink.
+        assert Path(decoy_target).read_text(encoding="utf-8") == "untouched\n"
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)
+
+
+def test_export_refuses_symlinked_index_file():
+    """A symlink pre-placed at output_dir/index.md must not be followed."""
+    import pytest
+
+    tmpdir = tempfile.mkdtemp()
+    try:
+        palace_path = _setup_palace(tmpdir)
+        decoy_target = os.path.join(tmpdir, "decoy_index.md")
+        Path(decoy_target).write_text("untouched\n", encoding="utf-8")
+        output_dir = os.path.join(tmpdir, "export")
+        os.makedirs(output_dir)
+        _try_symlink_or_skip(decoy_target, os.path.join(output_dir, "index.md"))
+
+        with pytest.raises(ValueError, match="symbolic link"):
+            export_palace(palace_path, output_dir)
+
+        assert Path(decoy_target).read_text(encoding="utf-8") == "untouched\n"
+    finally:
+        shutil.rmtree(tmpdir, ignore_errors=True)

From 2ff6283b32f2ed5825122a1e3284cee2fee8b85c Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 12:54:09 -0300
Subject: [PATCH 094/127] fix(diary): rebuild closets on hash change + backfill
 legacy state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address Copilot review on #925:

- Full closet rebuild whenever the content hash differs from prior
  state, not only on entry-count growth. Without this, an in-place
  edit (same entry count, different body) updated the drawer but
  left the closet/search index stale — defeats the verbatim guarantee
  at the search layer even if the drawer is correct.
- Legacy size-only skip path now records the computed content_hash
  back into state so subsequent runs use the strict hash check
  instead of remaining on the size-only path indefinitely.
- Test updates: typo direction in the regression test now matches the
  comment (typo "Teh" → fix "The"), assertion now also checks the
  closet collection reflects the edit, and a new test exercises the
  legacy-state backfill path.
---
 mempalace/diary_ingest.py | 19 ++++++++----
 tests/test_closets.py     | 64 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 73 insertions(+), 10 deletions(-)

diff --git a/mempalace/diary_ingest.py b/mempalace/diary_ingest.py
index 7939dda..e6ffe42 100644
--- a/mempalace/diary_ingest.py
+++ b/mempalace/diary_ingest.py
@@ -133,10 +133,16 @@ def ingest_diaries(
                 if curr_hash == prev_hash:
                     continue
             elif curr_size == prev_size and prev_size > 0:
-                # Legacy state without content_hash: keep size-based skip so a
-                # post-upgrade run doesn't re-ingest every untouched diary.
+                # Legacy state without content_hash: keep size-based skip but
+                # backfill the hash so future runs use the strict check.
+                state[state_key] = {**prev_entry, "content_hash": curr_hash}
                 continue
 
+        # An in-place edit (same entry count, different content) means existing
+        # closets are stale. Force a full rebuild whenever the hash changes,
+        # not only on entry-count growth.
+        content_changed = prev_hash is not None and curr_hash != prev_hash
+
         now_iso = datetime.now(timezone.utc).isoformat()
         drawer_id = _diary_drawer_id(wing, date_str)
         entities = _extract_entities_for_metadata(text)
@@ -163,7 +169,8 @@ def ingest_diaries(
 
             entries = _split_entries(text)
             prev_entry_count = state.get(state_key, {}).get("entry_count", 0)
-            new_entries = entries if force else entries[prev_entry_count:]
+            full_rebuild = force or content_changed
+            new_entries = entries if full_rebuild else entries[prev_entry_count:]
 
             if new_entries:
                 all_lines = []
@@ -185,9 +192,9 @@ def ingest_diaries(
                     }
                     if entities:
                         closet_meta["entities"] = entities
-                    # On a force rebuild, wipe any leftover numbered closets
-                    # from a longer prior run before re-writing.
-                    if force:
+                    # On any full rebuild (force or detected content edit),
+                    # wipe leftover closets from a prior run before re-writing.
+                    if full_rebuild:
                         purge_file_closets(closets_col, source_file)
                     n = upsert_closet_lines(closets_col, closet_id_base, all_lines, closet_meta)
                     closets_created += n
diff --git a/tests/test_closets.py b/tests/test_closets.py
index bd996e1..37a78f4 100644
--- a/tests/test_closets.py
+++ b/tests/test_closets.py
@@ -23,6 +23,7 @@
     cross-diary collisions, force=True purges leftover closets.
 """
 
+import hashlib
 import json
 import multiprocessing
 import os
@@ -607,13 +608,16 @@ def test_ingest_skips_unchanged_on_second_run(self, tmp_path):
 
     def test_ingest_detects_same_size_content_edit(self, tmp_path):
         # Regression #925: the prior skip-check compared byte length only, so
-        # any in-place edit preserving total length (e.g. typo fix "teh"→"the")
-        # was silently dropped. Content-hash check must catch it.
+        # any in-place edit preserving total length (typo fix "teh"→"the",
+        # word swap, character reorder) was silently dropped. Content-hash
+        # check must catch the change AND rebuild the searchable closet so
+        # the index does not stay stale while the drawer updates.
         diary_dir = tmp_path / "diaries"
         diary_dir.mkdir()
         diary_file = diary_dir / "2026-04-13.md"
-        original = "# 2026-04-13\n\n## 10:00 — Test\n\nThe quick brown fox jumps over.\n"
-        edited = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh quick brown fox jumps over.\n"
+        # Original has the typo "Teh"; the edit fixes it to "The" — same length.
+        original = "# 2026-04-13\n\n## 10:00 — Test\n\nTeh elaborate jakarta postgres bug.\n"
+        edited = "# 2026-04-13\n\n## 10:00 — Test\n\nThe elaborate jakarta postgres bug.\n"
         assert len(original) == len(edited), "test setup: edited content must be same length"
         diary_file.write_text(original)
         palace_dir = tmp_path / "palace"
@@ -625,6 +629,58 @@ def test_ingest_detects_same_size_content_edit(self, tmp_path):
         result = ingest_diaries(str(diary_dir), str(palace_dir))
         assert result["days_updated"] == 1, "same-size content edit must trigger re-ingest"
 
+        # Drawer must hold the corrected text.
+        drawers = get_collection(str(palace_dir)).get(where={"source_file": str(diary_file)})
+        joined_drawers = "\n".join(drawers["documents"])
+        assert "The elaborate" in joined_drawers
+        assert "Teh elaborate" not in joined_drawers, "drawer still holds pre-edit content"
+
+        # And the closet (search index) must reflect the edit too — not just the
+        # drawer. Otherwise searches would surface stale text.
+        closets = get_closets_collection(str(palace_dir)).get(
+            where={"source_file": str(diary_file)}
+        )
+        joined_closets = "\n".join(closets["documents"])
+        assert "Teh elaborate" not in joined_closets, "closet index still holds stale content"
+
+    def test_legacy_state_backfills_content_hash(self, tmp_path):
+        # Upgraded users can carry legacy state entries without ``content_hash``.
+        # Same-size skip is preserved for that one run, but the hash must be
+        # recorded so the strict check engages on subsequent runs.
+        diary_dir = tmp_path / "diaries"
+        diary_dir.mkdir()
+        diary_file = diary_dir / "2026-04-13.md"
+        text = "# 2026-04-13\n\n## 10:00 — Test\n\nUnchanged body content here.\n"
+        diary_file.write_text(text)
+        palace_dir = tmp_path / "palace"
+
+        from mempalace.diary_ingest import _state_file_for, ingest_diaries
+
+        # Simulate a legacy state file: only size + entry_count, no content_hash.
+        state_file = _state_file_for(str(palace_dir), diary_dir.resolve())
+        state_file.parent.mkdir(parents=True, exist_ok=True)
+        state_file.write_text(
+            json.dumps(
+                {
+                    f"diary|{diary_file.name}": {
+                        "size": len(text),
+                        "entry_count": 1,
+                        "ingested_at": "2026-04-12T00:00:00+00:00",
+                    }
+                }
+            )
+        )
+
+        # Run with no force — size matches, so this should skip ingest.
+        result = ingest_diaries(str(diary_dir), str(palace_dir))
+        assert result["days_updated"] == 0
+
+        # Hash must have been backfilled into state for the next run's strict check.
+        persisted = json.loads(state_file.read_text())
+        entry = persisted[f"diary|{diary_file.name}"]
+        assert "content_hash" in entry, "legacy skip path must record the hash"
+        assert entry["content_hash"] == hashlib.sha256(text.encode("utf-8")).hexdigest()
+
     def test_state_file_lives_outside_diary_dir(self, tmp_path):
         # Regression: the original implementation wrote
         # ``.diary_ingest_state.json`` *inside* the user's diary directory,

From 26bc3d4f912fd27ede8a4618e8e8b8a54438a953 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Thu, 7 May 2026 17:41:19 -0300
Subject: [PATCH 095/127] test(diary): write fixture with explicit utf-8 to fix
 Windows hash assert
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_legacy_state_backfills_content_hash failed on test-windows because
Path.write_text without an encoding uses the system locale (cp1252 on
Windows). The em dash was written as 0x97, then read back by
diary_ingest as UTF-8 with errors=replace — round-trip produced
different bytes than the in-Python literal, so the assertion comparing
the persisted hash to sha256(text.encode(utf-8)) diverged.

Pin the write side to encoding=utf-8 so the on-disk bytes match what
diary_ingest decodes. No production change.
---
 tests/test_closets.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/test_closets.py b/tests/test_closets.py
index 37a78f4..e016d83 100644
--- a/tests/test_closets.py
+++ b/tests/test_closets.py
@@ -650,8 +650,11 @@ def test_legacy_state_backfills_content_hash(self, tmp_path):
         diary_dir = tmp_path / "diaries"
         diary_dir.mkdir()
         diary_file = diary_dir / "2026-04-13.md"
+        # Write explicit UTF-8 so the round-trip matches how diary_ingest reads.
+        # Windows' default text-mode encoding is cp1252; without this the em
+        # dash would round-trip lossy and the hash assertion below would fail.
         text = "# 2026-04-13\n\n## 10:00 — Test\n\nUnchanged body content here.\n"
-        diary_file.write_text(text)
+        diary_file.write_text(text, encoding="utf-8")
         palace_dir = tmp_path / "palace"
 
         from mempalace.diary_ingest import _state_file_for, ingest_diaries

From ead2c5d2997755b7277a38963e2030459c237d6b Mon Sep 17 00:00:00 2001
From: Stephen Coogan <coogie@users.noreply.github.com>
Date: Sat, 18 Apr 2026 18:50:21 +0100
Subject: [PATCH 096/127] fix(miner): use token-boundary matching in
 detect_room
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Substring checks in path/filename routing caused systemic misrouting
in large monorepos — e.g., "views" ⊂ "interviews" sent every file
under views/ to the interviews room. Switch to separator-bounded
token matching (-, _, ., /) via a _name_matches helper, applied to
priority 1 (path parts) and priority 2 (filename).
---
 mempalace/miner.py  | 27 ++++++++++++-
 tests/test_miner.py | 99 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 123 insertions(+), 3 deletions(-)

diff --git a/mempalace/miner.py b/mempalace/miner.py
index 6aeddd4..f9c44e2 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -8,6 +8,7 @@
 """
 
 import os
+import re
 import sys
 import shlex
 import hashlib
@@ -332,6 +333,28 @@ def load_config(project_dir: str) -> dict:
 # FILE ROUTING — which room does this file belong to?
 # =============================================================================
 
+_TOKEN_SPLIT = re.compile(r"[-_./]+")
+
+
+def _tokens(value: str) -> set:
+    """Split ``value`` into lowercased tokens bounded by ``-``, ``_``, ``.`` or ``/``."""
+    return {t for t in _TOKEN_SPLIT.split(value.lower()) if t}
+
+
+def _name_matches(a: str, b: str) -> bool:
+    """Return True when ``a`` and ``b`` match as equal strings or as
+    separator-bounded tokens of each other.
+
+    Prevents incidental substring collisions (e.g., ``"views" in "interviews"``)
+    that a raw ``in`` check would produce, while preserving the intended
+    match for real tokens (e.g., ``"frontend"`` in ``"frontend-app"``).
+    """
+    a = a.lower()
+    b = b.lower()
+    if a == b:
+        return True
+    return b in _tokens(a) or a in _tokens(b)
+
 
 def detect_room(filepath: Path, content: str, rooms: list, project_path: Path) -> str:
     """
@@ -351,12 +374,12 @@ def detect_room(filepath: Path, content: str, rooms: list, project_path: Path) -
     for part in path_parts[:-1]:  # skip filename itself
         for room in rooms:
             candidates = [room["name"].lower()] + [k.lower() for k in room.get("keywords", [])]
-            if any(part == c or c in part or part in c for c in candidates):
+            if any(_name_matches(part, c) for c in candidates):
                 return room["name"]
 
     # Priority 2: filename matches room name
     for room in rooms:
-        if room["name"].lower() in filename or filename in room["name"].lower():
+        if _name_matches(filename, room["name"]):
             return room["name"]
 
     # Priority 3: keyword scoring from room keywords + name
diff --git a/tests/test_miner.py b/tests/test_miner.py
index 10dd33d..ab053d7 100644
--- a/tests/test_miner.py
+++ b/tests/test_miner.py
@@ -7,7 +7,7 @@
 import chromadb
 import yaml
 
-from mempalace.miner import load_config, mine, scan_project, status
+from mempalace.miner import detect_room, load_config, mine, scan_project, status
 from mempalace.palace import NORMALIZE_VERSION, file_already_mined
 
 
@@ -491,6 +491,103 @@ def test_file_already_mined_returns_false_for_stale_normalize_version():
         shutil.rmtree(tmpdir, ignore_errors=True)
 
 
+def test_detect_room_uses_token_boundary_matching(tmp_path):
+    """Path-part routing must not fire on incidental substrings.
+
+    Regression: "views" is a substring of "interviews", so the old
+    substring check routed every file under views/ into a room keyed
+    by "interviews". Token-boundary matching prevents this while still
+    matching real tokens like "frontend" in "frontend-app".
+    """
+    project = tmp_path
+    rooms = [
+        {"name": "billing-page", "keywords": ["billing-page"]},
+        {"name": "interviews", "keywords": ["interviews"]},
+        {"name": "general", "keywords": []},
+    ]
+
+    # views/<X>/... must NOT route to "interviews" on the "views"⊂"interviews" accident
+    view_file = project / "views" / "billing-page" / "Foo.test.tsx"
+    view_file.parent.mkdir(parents=True)
+    view_file.write_text("content")
+    assert detect_room(view_file, "content", rooms, project) == "billing-page"
+
+    # data/interviews/... must route to "interviews" via the real token
+    data_file = project / "data" / "interviews" / "index.ts"
+    data_file.parent.mkdir(parents=True)
+    data_file.write_text("content")
+    assert detect_room(data_file, "content", rooms, project) == "interviews"
+
+
+def test_detect_room_preserves_token_matches(tmp_path):
+    """Real separator-bounded tokens still match in both directions."""
+    project = tmp_path
+    rooms = [
+        {"name": "frontend", "keywords": ["frontend"]},
+        {"name": "general", "keywords": []},
+    ]
+
+    # path part contains keyword as a token
+    f1 = project / "frontend-app" / "main.ts"
+    f1.parent.mkdir(parents=True)
+    f1.write_text("x")
+    assert detect_room(f1, "x", rooms, project) == "frontend"
+
+    # keyword contains path part as a token (reverse direction)
+    rooms2 = [
+        {"name": "data-retention", "keywords": ["data-retention"]},
+        {"name": "general", "keywords": []},
+    ]
+    f2 = project / "data" / "data-retention" / "policy.ts"
+    f2.parent.mkdir(parents=True)
+    f2.write_text("x")
+    assert detect_room(f2, "x", rooms2, project) == "data-retention"
+
+
+def test_detect_room_matches_keyword_distinct_from_name(tmp_path):
+    """Regression: PR #145 — path part must match a keyword even when the
+    room name itself doesn't contain the path part as a token.
+
+    Scenario: a folder named ``docs/`` should route to a room named
+    ``documentation`` that declares ``"docs"`` as a keyword.
+    """
+    project = tmp_path
+    rooms = [
+        {"name": "documentation", "keywords": ["docs"]},
+        {"name": "general", "keywords": []},
+    ]
+
+    f = project / "docs" / "readme.md"
+    f.parent.mkdir(parents=True)
+    f.write_text("x")
+    assert detect_room(f, "x", rooms, project) == "documentation"
+
+
+def test_detect_room_filename_match_uses_token_boundary(tmp_path):
+    """Priority 2 (filename match) must also use token-boundary rules."""
+    project = tmp_path
+    rooms = [
+        {"name": "review", "keywords": []},
+        {"name": "general", "keywords": []},
+    ]
+
+    # "review" is a substring of "reviewmodule" but not a token — should NOT match
+    f1 = project / "reviewmodule.ts"
+    f1.write_text("x")
+    assert detect_room(f1, "x", rooms, project) != "review"
+
+    # "review" IS a token of "review-page" — should match
+    f2 = project / "review-page.ts"
+    f2.write_text("x")
+    assert detect_room(f2, "x", rooms, project) == "review"
+
+    # Dotted filename stems like "Foo.test" split on "." too
+    rooms3 = [{"name": "foo", "keywords": []}, {"name": "general", "keywords": []}]
+    f3 = project / "foo.test.ts"
+    f3.write_text("x")
+    assert detect_room(f3, "x", rooms3, project) == "foo"
+
+
 def test_add_drawer_stamps_normalize_version(tmp_path):
     """Fresh drawers carry the current schema version so future upgrades work."""
     from mempalace.miner import add_drawer

From 3d0d037b8742d35e79cefb396015b069b9af7183 Mon Sep 17 00:00:00 2001
From: Stephen Coogan <coogie@users.noreply.github.com>
Date: Tue, 5 May 2026 12:34:37 +0100
Subject: [PATCH 097/127] docs(changelog): add 3.3.5 entry for detect_room
 substring fix (#1004)

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1972c03..be5c14e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 - **Knowledge-graph triples with `valid_to < valid_from` were silently invisible.** `KnowledgeGraph.query_entity()` filters with `valid_from <= as_of AND valid_to >= as_of`, so an inverted interval matches no `as_of` and the row is durably stored but unreachable — a P0 data-integrity foot-gun any caller that mixes up the two date params can hit. `add_triple()` now rejects inverted intervals at write time with a clear `ValueError` naming both bounds. Open intervals (one bound only) and point-in-time facts (`valid_from == valid_to`) remain accepted unchanged. (#1214)
 - **`ChromaBackend.close_palace()` / `close()` did not release the SQLite file lock.** Evicted clients sat in `_clients` without `close()`, and chromadb 1.5.x retains the rust-side SQLite lock until GC. Reopening the same palace path after `shutil.rmtree` + recreate within one process failed with `SQLITE_READONLY_DBMOVED` (code 1032). New `_close_client()` helper now calls `PersistentClient.close()` (with a try/except fallback for older chromadb) on `close_palace()`, on whole-backend `close()`, and on the `_client()` invalidation path that detects a missing `chroma.sqlite3`. The mtime/inode auto-invalidation branch is intentionally left alone — callers there may still hold a live `ChromaCollection`. (#1067, #1105)
 - **`EntityRegistry.save()` could leave a corrupt or empty `entity_registry.json` on crash.** `Path.write_text()` is not atomic — kernel sees `open('w')` (truncate), `write`, `close`, and any failure between truncate and full-flush (power loss, OOM, FS-full, kill -9) wipes the months-of-mining people/projects map silently (the registry's `load()` swallows `JSONDecodeError`). Save now writes to a sibling `.tmp` in the same directory, `fsync`s, `chmod 0o600`s, then `os.replace()`s into place — atomic on POSIX and Windows. The previous registry stays intact on any crash before the rename returns. (#1215)
+- **`miner.detect_room` bidirectional substring matching caused systemic misrouting.** The priority-1 (path parts) and priority-2 (filename) checks used `c in part or part in c` against room names + keywords, so any token that was an unbounded substring of a room name (or vice versa) matched. Priority-1 iterates left-to-right and returns on first match, so `views/billing-page/src/Foo.test.tsx` routed to an `interviews` room because `"views" in "interviews"` matched before reaching `billing-page`. Both call sites now use a `_name_matches` helper that compares names as equal or as separator-bounded tokens of each other (split on `-`, `_`, `.`, `/`). (#1004, closes #1002)
 - **`mempalace compress` crashed on large palaces.** `regenerate_closets` fetched all closet_llm drawers in a single `col.get()`, which trips `SQLITE_MAX_VARIABLE_NUMBER` on palaces above ~32k drawers. Mirrors the #851 fix in `miner.py`: drawer fetch is now paginated at `batch_size=5000`. Per-source aggregation works across batches, so the LLM regeneration call still groups chunks correctly. (#1073, #1107)
 - **CLI and `fact_checker --stdin` mojibaked non-ASCII content on Windows.** Python defaults `sys.stdin`/`stdout`/`stderr` to the system ANSI codepage (cp1252/cp1251/cp950), so `mempalace search > out.txt` and piped fact_checker invocations corrupted Cyrillic / CJK drawer text at the process boundary. New `mempalace/_stdio.py` helper reconfigures all three streams to UTF-8 on `sys.platform == "win32"`, with per-stream `errors` policy: `surrogateescape` on stdin (preserves bad bytes from redirected files for the consumer's parser), `replace` on stdout/stderr (substitutes U+FFFD instead of `UnicodeEncodeError`-ing mid-print). With this, all three user-facing console_scripts (`mcp_server`, `hooks_cli`, `cli`/`fact_checker`) now reconfigure identically on Windows. (#1282)
 - **MCP knowledge-graph tools forwarded malformed date strings to SQLite.** `tool_kg_query` (`as_of`), `tool_kg_add` (`valid_from`), and `tool_kg_invalidate` (`ended`) accepted any string and produced empty result sets on natural-language inputs like `"March 2026"` or `"yesterday"` — callers (especially LLM agents) could not distinguish "no fact at this time" from "your date format was unrecognized." New `sanitize_iso_date()` validator in `config.py` accepts `YYYY`, `YYYY-MM`, `YYYY-MM-DD` (and passes through `None`/`""`); all three tools call it before values reach the storage layer. **Behavior change:** previously-silent date typos now raise a clear `ValueError` naming the offending field; full ISO-8601 with time (`YYYY-MM-DDTHH:MM:SS`, timezone offsets) is not yet accepted — file an issue if you have a use case. (#1164, #1167)

From 71804c0aa59965fce2508c2552323a4833f4fcf6 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 00:55:11 -0300
Subject: [PATCH 098/127] fix(hooks): detach Popen children so the hook can
 exit on Windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Stop hook spawns mining subprocesses via subprocess.Popen and then
returns. On Windows the parent stays blocked at session end because the
child inherits stdout/stderr handles and the OS waits for them to
release before the parent can exit — the user-visible symptom is the
"running stop hooks... 3/3" spinner hanging for minutes (#1268).

Add _detached_popen_kwargs() helper that returns the right detach knobs
per platform:
- POSIX: start_new_session=True, stdin=DEVNULL, close_fds=True
- Windows: creationflags=DETACHED_PROCESS|CREATE_NEW_PROCESS_GROUP|
  CREATE_BREAKAWAY_FROM_JOB, stdin=DEVNULL, close_fds=True

Apply to all three fire-and-forget Popen sites in hooks_cli:
_spawn_mine, _ingest_transcript, _desktop_toast. Leave _mine_sync's
subprocess.run alone — that path is intentionally synchronous (the
precompact hook must wait for the mine to finish).

Note: the issue body references mempalace-stop.js, which does not exist
in this repo (the plugin ships shell wrappers calling Python). The
mechanism described — child holds parent open via inherited handles —
is universal, so this fix targets the equivalent symptom in our Python
hook path. Will follow up on the upstream JS file with the reporter.
---
 mempalace/hooks_cli.py  | 25 +++++++++++++-
 tests/test_hooks_cli.py | 72 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py
index 8498103..258f1e0 100644
--- a/mempalace/hooks_cli.py
+++ b/mempalace/hooks_cli.py
@@ -19,6 +19,27 @@
 PALACE_ROOT = Path.home() / ".mempalace"
 
 
+def _detached_popen_kwargs() -> dict:
+    """Kwargs that fully detach a Popen child so the hook process can exit.
+
+    Without these, Windows holds the parent open until the child closes the
+    inherited stdout/stderr handles — manifesting as "Stop hook hangs" at
+    session end (#1268). On POSIX the parent can already exit (orphan
+    reparents to init), but ``start_new_session`` makes the boundary
+    explicit so signals to the hook don't propagate to the background mine.
+    """
+    kwargs: dict = {"stdin": subprocess.DEVNULL, "close_fds": True}
+    if os.name == "nt":
+        flags = 0
+        for name in ("DETACHED_PROCESS", "CREATE_NEW_PROCESS_GROUP", "CREATE_BREAKAWAY_FROM_JOB"):
+            flags |= getattr(subprocess, name, 0)
+        if flags:
+            kwargs["creationflags"] = flags
+    else:
+        kwargs["start_new_session"] = True
+    return kwargs
+
+
 def _palace_root_exists() -> bool:
     """User-removable kill-switch.
 
@@ -285,7 +306,7 @@ def _spawn_mine(cmd: list) -> None:
     STATE_DIR.mkdir(parents=True, exist_ok=True)
     log_path = STATE_DIR / "hook.log"
     with open(log_path, "a") as log_f:
-        proc = subprocess.Popen(cmd, stdout=log_f, stderr=log_f)
+        proc = subprocess.Popen(cmd, stdout=log_f, stderr=log_f, **_detached_popen_kwargs())
     _MINE_PID_FILE.write_text(str(proc.pid))
 
 
@@ -350,6 +371,7 @@ def _desktop_toast(body: str, title: str = "MemPalace"):
             ["notify-send", "--app-name=MemPalace", "--icon=brain", title, body],
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
+            **_detached_popen_kwargs(),
         )
     except OSError:
         pass
@@ -513,6 +535,7 @@ def _ingest_transcript(transcript_path: str):
                 ],
                 stdout=log_f,
                 stderr=log_f,
+                **_detached_popen_kwargs(),
             )
         _log(f"Transcript ingest started: {path.name}")
     except OSError:
diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index 19ecbaf..c4763c9 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -560,6 +560,78 @@ def test_maybe_auto_ingest_skips_when_mine_running(tmp_path):
                     mock_popen.assert_not_called()
 
 
+# --- _detached_popen_kwargs ---
+
+
+def test_detached_popen_kwargs_posix(monkeypatch):
+    """On POSIX, kwargs include start_new_session so the child detaches."""
+    from mempalace.hooks_cli import _detached_popen_kwargs
+
+    monkeypatch.setattr("mempalace.hooks_cli.os.name", "posix")
+    kwargs = _detached_popen_kwargs()
+    assert kwargs.get("start_new_session") is True
+    assert kwargs.get("stdin") is subprocess.DEVNULL
+    assert kwargs.get("close_fds") is True
+    assert "creationflags" not in kwargs
+
+
+def test_detached_popen_kwargs_windows(monkeypatch):
+    """On Windows, kwargs include creationflags that fully detach the child.
+
+    Without these, the parent hook hangs at session end on Windows because
+    the child's inherited stdout/stderr handles keep the parent's exit
+    blocked (#1268 root cause for the Python hook path).
+    """
+    from mempalace.hooks_cli import _detached_popen_kwargs
+
+    monkeypatch.setattr("mempalace.hooks_cli.os.name", "nt")
+    # Simulate Windows-only Popen flag constants. Patch on the imported
+    # subprocess module within hooks_cli so getattr() picks them up.
+    monkeypatch.setattr(
+        "mempalace.hooks_cli.subprocess.DETACHED_PROCESS", 0x00000008, raising=False
+    )
+    monkeypatch.setattr(
+        "mempalace.hooks_cli.subprocess.CREATE_NEW_PROCESS_GROUP", 0x00000200, raising=False
+    )
+    kwargs = _detached_popen_kwargs()
+    assert kwargs.get("stdin") is subprocess.DEVNULL
+    assert kwargs.get("close_fds") is True
+    flags = kwargs.get("creationflags", 0)
+    assert flags & 0x00000008, "DETACHED_PROCESS must be set"
+    assert flags & 0x00000200, "CREATE_NEW_PROCESS_GROUP must be set"
+
+
+def test_spawn_mine_uses_detached_kwargs(tmp_path):
+    """_spawn_mine forwards detached kwargs so the hook can exit cleanly."""
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
+            with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                mock_popen.return_value.pid = 9999
+                from mempalace.hooks_cli import _spawn_mine
+
+                _spawn_mine(["mempalace", "mine", "/tmp/x"])
+                kwargs = mock_popen.call_args.kwargs
+                # The exact key set varies by platform; assert on the
+                # shared invariants that protect against the Windows hang.
+                assert kwargs.get("stdin") is subprocess.DEVNULL
+                assert kwargs.get("close_fds") is True
+
+
+def test_ingest_transcript_uses_detached_kwargs(tmp_path):
+    """_ingest_transcript spawns the convos mine with detach kwargs."""
+    transcript = tmp_path / "session.jsonl"
+    transcript.write_text("x" * 200)  # > 100 byte gate
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+            from mempalace.hooks_cli import _ingest_transcript
+
+            _ingest_transcript(str(transcript))
+            assert mock_popen.called
+            kwargs = mock_popen.call_args.kwargs
+            assert kwargs.get("stdin") is subprocess.DEVNULL
+            assert kwargs.get("close_fds") is True
+
+
 # --- _mine_already_running ---
 
 
From ef8d83cc8ac295ceae70982b40842b2727413236 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 01:00:00 -0300
Subject: [PATCH 099/127] fix(mine): identify lock holder + exit non-zero on
 contention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a `mempalace mine` collided with another writer (live mcp_server,
another mine, anything taking mine_palace_lock), the operator saw a
generic "another `mempalace mine` is already running" message and the
CLI exited 0 — making the contention invisible to nohup or scripts
checking $?. The reporter ran a `nohup mempalace mine ... & disown`
and got a 200-byte log with only the auto-defaults warning, no clue
that an MCP server was holding the store.

palace.py: the lock file now records the holder's PID + first three
argv tokens on acquire. A failed acquire reads the file and surfaces
"palace <path> is held by PID N (mempalace mcp_server); wait for it
to finish or stop the holder before retrying" in the
MineAlreadyRunning message. Open mode changes from "w" to "a+" so the
prior holder's identity survives long enough to be read.

miner.mine() now lets MineAlreadyRunning propagate. cmd_mine catches
it, prints the holder-aware message to stderr, and exits non-zero so
shell wrappers detect the contention.

Note: this is a behavior change for in-process callers that depended
on miner.mine() silently swallowing MineAlreadyRunning. The silent
swallow was the bug.

Closes #1264
---
 mempalace/cli.py           | 58 ++++++++++++++++------------
 mempalace/miner.py         | 34 +++++++---------
 mempalace/palace.py        | 51 ++++++++++++++++++++++--
 tests/test_cli.py          | 39 +++++++++++++++++++
 tests/test_palace_locks.py | 79 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 214 insertions(+), 47 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 6a531e7..964fa84 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -500,31 +500,41 @@ def cmd_mine(args):
             llm_provider=None,
         )
 
-    if args.mode == "convos":
-        from .convo_miner import mine_convos
+    from .palace import MineAlreadyRunning
 
-        mine_convos(
-            convo_dir=args.dir,
-            palace_path=palace_path,
-            wing=args.wing,
-            agent=args.agent,
-            limit=args.limit,
-            dry_run=args.dry_run,
-            extract_mode=args.extract,
-        )
-    else:
-        from .miner import mine
-
-        mine(
-            project_dir=args.dir,
-            palace_path=palace_path,
-            wing_override=args.wing,
-            agent=args.agent,
-            limit=args.limit,
-            dry_run=args.dry_run,
-            respect_gitignore=not args.no_gitignore,
-            include_ignored=include_ignored,
-        )
+    try:
+        if args.mode == "convos":
+            from .convo_miner import mine_convos
+
+            mine_convos(
+                convo_dir=args.dir,
+                palace_path=palace_path,
+                wing=args.wing,
+                agent=args.agent,
+                limit=args.limit,
+                dry_run=args.dry_run,
+                extract_mode=args.extract,
+            )
+        else:
+            from .miner import mine
+
+            mine(
+                project_dir=args.dir,
+                palace_path=palace_path,
+                wing_override=args.wing,
+                agent=args.agent,
+                limit=args.limit,
+                dry_run=args.dry_run,
+                respect_gitignore=not args.no_gitignore,
+                include_ignored=include_ignored,
+            )
+    except MineAlreadyRunning as exc:
+        # A live MCP server or another mine is already writing to this
+        # palace. Surface the holder identity so the operator knows what
+        # to wait for (or stop), and exit non-zero so wrappers like
+        # nohup / scripts can detect the contention.
+        print(f"mempalace: {exc}", file=sys.stderr)
+        sys.exit(1)
 
 
 def cmd_sweep(args):
diff --git a/mempalace/miner.py b/mempalace/miner.py
index 6aeddd4..09cc517 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -21,7 +21,6 @@
 from .palace import (
     NORMALIZE_VERSION,
     SKIP_DIRS,
-    MineAlreadyRunning,
     build_closet_lines,
     file_already_mined,
     get_closets_collection,
@@ -1035,26 +1034,21 @@ def mine(
             files=files,
         )
 
-    try:
-        with mine_palace_lock(palace_path):
-            return _mine_impl(
-                project_dir,
-                palace_path,
-                wing_override=wing_override,
-                agent=agent,
-                limit=limit,
-                dry_run=dry_run,
-                respect_gitignore=respect_gitignore,
-                include_ignored=include_ignored,
-                files=files,
-            )
-    except MineAlreadyRunning:
-        print(
-            f"mempalace: another `mine` is already running against "
-            f"{palace_path} — exiting cleanly.",
-            file=sys.stderr,
+    # MineAlreadyRunning propagates so the CLI can render a clear holder-aware
+    # message and exit non-zero. In-process callers (tests, library users) that
+    # expect to coexist with another writer should handle the exception.
+    with mine_palace_lock(palace_path):
+        return _mine_impl(
+            project_dir,
+            palace_path,
+            wing_override=wing_override,
+            agent=agent,
+            limit=limit,
+            dry_run=dry_run,
+            respect_gitignore=respect_gitignore,
+            include_ignored=include_ignored,
+            files=files,
         )
-        return
 
 
 def _mine_impl(
diff --git a/mempalace/palace.py b/mempalace/palace.py
index dee5c8f..375b5e1 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -9,6 +9,7 @@
 import logging
 import os
 import re
+import sys
 import threading
 from typing import Optional
 
@@ -364,6 +365,41 @@ def _mark_released(lock_key: str) -> None:
     _holder_state().discard(lock_key)
 
 
+def _format_lock_holder(content: str) -> str:
+    """Render a lock-file body as 'PID N (cmdline)' for diagnostic messages."""
+    parts = content.split(maxsplit=1)
+    if not parts or not parts[0].isdigit():
+        return "another writer (identity not recorded)"
+    pid = parts[0]
+    if len(parts) > 1 and parts[1].strip():
+        return f"PID {pid} ({parts[1].strip()})"
+    return f"PID {pid}"
+
+
+def _read_lock_holder(lock_file) -> str:
+    """Read the prior holder's identity from the lock-file body, best-effort."""
+    try:
+        lock_file.seek(0)
+        content = lock_file.read().strip()
+    except OSError:
+        return "another writer (identity not recorded)"
+    if not content:
+        return "another writer (identity not recorded)"
+    return _format_lock_holder(content)
+
+
+def _write_lock_holder(lock_file) -> None:
+    """Record this process's identity in the lock-file body. Best-effort."""
+    try:
+        ident = f"{os.getpid()} {' '.join(sys.argv[:3])}".strip()
+        lock_file.seek(0)
+        lock_file.truncate()
+        lock_file.write(ident)
+        lock_file.flush()
+    except OSError:
+        pass
+
+
 @contextlib.contextmanager
 def mine_palace_lock(palace_path: str):
     """Per-palace non-blocking lock around the full `mine` pipeline.
@@ -407,7 +443,10 @@ def mine_palace_lock(palace_path: str):
         yield
         return
 
-    lf = open(lock_path, "w")
+    # "a+" preserves the prior holder's identity recorded inside the file so
+    # a failed acquire can name who is holding the lock (#1264). "w" mode
+    # would have truncated the file before we could read it.
+    lf = open(lock_path, "a+")
     acquired = False
     try:
         if os.name == "nt":
@@ -417,8 +456,10 @@ def mine_palace_lock(palace_path: str):
                 msvcrt.locking(lf.fileno(), msvcrt.LK_NBLCK, 1)
                 acquired = True
             except OSError as exc:
+                holder = _read_lock_holder(lf)
                 raise MineAlreadyRunning(
-                    f"another `mempalace mine` is already running against {resolved}"
+                    f"palace {resolved} is held by {holder}; "
+                    "wait for it to finish or stop the holder before retrying"
                 ) from exc
         else:
             import fcntl
@@ -427,9 +468,13 @@ def mine_palace_lock(palace_path: str):
                 fcntl.flock(lf, fcntl.LOCK_EX | fcntl.LOCK_NB)
                 acquired = True
             except BlockingIOError as exc:
+                holder = _read_lock_holder(lf)
                 raise MineAlreadyRunning(
-                    f"another `mempalace mine` is already running against {resolved}"
+                    f"palace {resolved} is held by {holder}; "
+                    "wait for it to finish or stop the holder before retrying"
                 ) from exc
+        # Record our own identity for any later contender's diagnostic message.
+        _write_lock_holder(lf)
         _mark_held(palace_key)
         try:
             yield
diff --git a/tests/test_cli.py b/tests/test_cli.py
index fa5680d..547286d 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -555,6 +555,45 @@ def test_cmd_mine_include_ignored_comma_split(mock_config_cls):
         assert call_kwargs["include_ignored"] == ["a.txt", "b.txt", "c.txt"]
 
 
+@patch("mempalace.cli.MempalaceConfig")
+def test_cmd_mine_exits_nonzero_on_lock_holder(mock_config_cls, capsys):
+    """Regression #1264: lock contention must exit non-zero with a clear message.
+
+    Before this fix the CLI silently returned 0 when another writer held
+    the palace lock — operators using nohup/scripts had no way to detect
+    the contention. The new behavior raises MineAlreadyRunning out of
+    miner.mine() and cmd_mine catches it, printing the holder identity
+    to stderr and exiting non-zero.
+    """
+    from mempalace.palace import MineAlreadyRunning
+
+    mock_config_cls.return_value.palace_path = "/fake/palace"
+    args = argparse.Namespace(
+        dir="/src",
+        palace=None,
+        mode="projects",
+        wing=None,
+        agent="mempalace",
+        limit=0,
+        dry_run=False,
+        no_gitignore=False,
+        include_ignored=[],
+        extract="exchange",
+    )
+    with patch(
+        "mempalace.miner.mine",
+        side_effect=MineAlreadyRunning(
+            "palace /fake/palace is held by PID 12345 (mempalace mcp_server); wait for it to finish"
+        ),
+    ):
+        with pytest.raises(SystemExit) as excinfo:
+            cmd_mine(args)
+    assert excinfo.value.code == 1
+    captured = capsys.readouterr()
+    assert "PID 12345" in captured.err
+    assert "mcp_server" in captured.err
+
+
 # ── cmd_wakeup ─────────────────────────────────────────────────────────
 
 
diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index d239757..27235dd 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -208,6 +208,85 @@ def _try_acquire_expect_busy(palace_path, result_q):
         result_q.put("busy")
 
 
+def _hold_lock_send_pid(palace_path: str, ready_flag: str, release_flag: str, pid_q) -> None:
+    """Acquire the lock, push our PID + cmdline through the queue, then wait."""
+    import sys as _sys
+
+    try:
+        with mine_palace_lock(palace_path):
+            pid_q.put((os.getpid(), list(_sys.argv[:3])))
+            open(ready_flag, "w").close()
+            for _ in range(500):
+                if os.path.exists(release_flag):
+                    return
+                time.sleep(0.01)
+    except MineAlreadyRunning:
+        pid_q.put(("error", "raised"))
+
+
+def test_lock_failure_message_names_holder(tmp_path, monkeypatch):
+    """Regression #1264: failed acquire must identify the holder by PID.
+
+    Before this fix, a `mempalace mine` colliding with another writer
+    (mine, MCP server, anything taking mine_palace_lock) saw a generic
+    "another `mempalace mine` is already running" message and exited
+    silently. The operator had no signal of which process to wait for
+    or stop. The new message includes ``PID N`` so the holder can be
+    identified directly.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    ready = str(tmp_path / "ready")
+    release = str(tmp_path / "release")
+
+    ctx = _get_mp_context()
+    pid_q = ctx.Queue()
+    holder = ctx.Process(target=_hold_lock_send_pid, args=(palace, ready, release, pid_q))
+    holder.start()
+    try:
+        for _ in range(500):
+            if os.path.exists(ready):
+                break
+            time.sleep(0.01)
+        assert os.path.exists(ready), "holder failed to acquire lock in time"
+        holder_pid, _holder_argv = pid_q.get(timeout=2)
+
+        with pytest.raises(MineAlreadyRunning) as excinfo:
+            with mine_palace_lock(palace):
+                pytest.fail("second acquire of same palace should have raised")
+
+        msg = str(excinfo.value)
+        assert (
+            f"PID {holder_pid}" in msg
+        ), f"lock-failure message must name the holder PID; got: {msg!r}"
+    finally:
+        open(release, "w").close()
+        holder.join(timeout=5)
+
+
+def test_lock_holder_identity_persists_across_release(tmp_path, monkeypatch):
+    """The holder line is overwritten by each new acquirer, not appended.
+
+    Without explicit truncate the lock file would accumulate lines across
+    runs and grow without bound. Verify that re-acquire keeps the body
+    bounded.
+    """
+    monkeypatch.setenv("HOME", str(tmp_path))
+    palace = str(tmp_path / "palace")
+    for _ in range(5):
+        with mine_palace_lock(palace):
+            pass
+
+    # Locate the lock file. The key derivation is internal but we can find
+    # it by scanning the mempalace locks dir for mine_palace_*.lock entries.
+    lock_dir = tmp_path / ".mempalace" / "locks"
+    lock_files = list(lock_dir.glob("mine_palace_*.lock"))
+    assert lock_files, "expected the palace lock file to exist after acquire/release"
+    body = lock_files[0].read_text()
+    # One identity line, no accumulation.
+    assert body.count("\n") <= 1, f"lock body must not grow across re-acquires; got {body!r}"
+
+
 def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):
     """Old callers of `mine_global_lock` should still work."""
     monkeypatch.setenv("HOME", str(tmp_path))

From d5ce97c7afe0c533a3a62ec3929c14563ef5a01d Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 01:28:42 -0300
Subject: [PATCH 100/127] fix(palace): reserve byte 0 as lock sentinel for
 Windows portability

Windows CI surfaced two bugs introduced by the holder-identity write:

1. msvcrt.locking(LK_NBLCK, 1) locks 1 byte at the *current* file
   position. Switching to "a+" mode put the position at end-of-file,
   so two contenders locked different bytes and silently both
   acquired (the test asserts saw [(ok, 1), (ok, 2)] instead of
   ok+busy).

2. With the byte-range lock active on Windows, the locked byte is
   read-blocked for other processes. A contender trying to read the
   holder identity from byte 0 would hit PermissionError.

Switch to "r+" mode (after touch-create) and explicitly seek(0) before
both lock and unlock. Then reserve byte 0 as a pure lock sentinel and
write the holder identity from byte 1 onward. _read_lock_holder reads
from byte 1+, so it never touches the locked byte.

Also bound file growth across re-acquires: truncate to
sentinel + len(ident) before writing so the file body stays the size
of the current holder, never accumulating across runs.

Linux fcntl.flock locks the whole file independent of byte position,
so the seek(0) is harmless on POSIX. The shape works on both.
---
 mempalace/palace.py | 45 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/mempalace/palace.py b/mempalace/palace.py
index 375b5e1..7ed315c 100644
--- a/mempalace/palace.py
+++ b/mempalace/palace.py
@@ -376,10 +376,17 @@ def _format_lock_holder(content: str) -> str:
     return f"PID {pid}"
 
 
+# Byte 0 of the lock file is reserved as the OS lock sentinel.
+# Holder identity is written from byte 1 onward so contenders can read
+# the identity without colliding with byte 0 (Windows msvcrt.locking
+# blocks both reads and writes on the locked byte).
+_LOCK_SENTINEL_BYTES = 1
+
+
 def _read_lock_holder(lock_file) -> str:
     """Read the prior holder's identity from the lock-file body, best-effort."""
     try:
-        lock_file.seek(0)
+        lock_file.seek(_LOCK_SENTINEL_BYTES)
         content = lock_file.read().strip()
     except OSError:
         return "another writer (identity not recorded)"
@@ -389,11 +396,16 @@ def _read_lock_holder(lock_file) -> str:
 
 
 def _write_lock_holder(lock_file) -> None:
-    """Record this process's identity in the lock-file body. Best-effort."""
+    """Record this process's identity in the lock-file body. Best-effort.
+
+    Writes from byte 1 onward; byte 0 is the lock sentinel and must not
+    be touched after acquire (truncating it on Windows can interact
+    badly with the active byte-range lock).
+    """
     try:
         ident = f"{os.getpid()} {' '.join(sys.argv[:3])}".strip()
-        lock_file.seek(0)
-        lock_file.truncate()
+        lock_file.seek(_LOCK_SENTINEL_BYTES)
+        lock_file.truncate(_LOCK_SENTINEL_BYTES + len(ident.encode("utf-8")))
         lock_file.write(ident)
         lock_file.flush()
     except OSError:
@@ -443,12 +455,27 @@ def mine_palace_lock(palace_path: str):
         yield
         return
 
-    # "a+" preserves the prior holder's identity recorded inside the file so
-    # a failed acquire can name who is holding the lock (#1264). "w" mode
-    # would have truncated the file before we could read it.
-    lf = open(lock_path, "a+")
+    # Ensure the file exists, then open r+ so we can both read the prior
+    # holder's identity (for failure diagnostics) and write our own. "w"
+    # truncates and erases the prior holder. "a+" puts the position at EOF,
+    # which on Windows breaks ``msvcrt.locking`` (it locks 1 byte at the
+    # *current* position, so two contenders end up locking different bytes
+    # and silently both acquire — observed as Windows-CI lock test
+    # failures during #1264 development).
+    if not os.path.exists(lock_path):
+        # Touch atomically: O_CREAT|O_EXCL would fail if a concurrent
+        # contender just created it, which is fine — we proceed to open.
+        try:
+            fd = os.open(lock_path, os.O_CREAT | os.O_WRONLY, 0o600)
+            os.close(fd)
+        except FileExistsError:
+            pass
+    lf = open(lock_path, "r+")
     acquired = False
     try:
+        # Lock byte 0 explicitly. msvcrt.locking is byte-position dependent;
+        # fcntl.flock is whole-file but the seek is harmless there.
+        lf.seek(0)
         if os.name == "nt":
             import msvcrt
 
@@ -486,6 +513,8 @@ def mine_palace_lock(palace_path: str):
                 if os.name == "nt":
                     import msvcrt
 
+                    # Match the lock region: byte 0.
+                    lf.seek(0)
                     msvcrt.locking(lf.fileno(), msvcrt.LK_UNLCK, 1)
                 else:
                     import fcntl

From 11a35de5ac4758474fbae59b1f348d4f8762130f Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 01:34:46 -0300
Subject: [PATCH 101/127] test(palace): set USERPROFILE too so the lock-path
 test works on Windows

os.path.expanduser("~") reads HOME on POSIX but USERPROFILE on Windows;
the lock-body bound test was monkeypatching HOME only, so on
test-windows the lock file landed in the runner's real ~/.mempalace
and the tmp_path glob found nothing.

Patch USERPROFILE in addition to HOME, and read the body as bytes so
the byte-0 sentinel doesn't trip a UTF-8 decode warning. Assertion
shifts from line-count to size-bound (still detects unbounded growth
across re-acquires).
---
 tests/test_palace_locks.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 27235dd..2e9f82f 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -271,7 +271,11 @@ def test_lock_holder_identity_persists_across_release(tmp_path, monkeypatch):
     runs and grow without bound. Verify that re-acquire keeps the body
     bounded.
     """
+    # ``os.path.expanduser("~")`` reads HOME on POSIX but USERPROFILE on
+    # Windows; setting both makes the ``~/.mempalace/locks`` lookup land
+    # under ``tmp_path`` regardless of platform.
     monkeypatch.setenv("HOME", str(tmp_path))
+    monkeypatch.setenv("USERPROFILE", str(tmp_path))
     palace = str(tmp_path / "palace")
     for _ in range(5):
         with mine_palace_lock(palace):
@@ -282,9 +286,13 @@ def test_lock_holder_identity_persists_across_release(tmp_path, monkeypatch):
     lock_dir = tmp_path / ".mempalace" / "locks"
     lock_files = list(lock_dir.glob("mine_palace_*.lock"))
     assert lock_files, "expected the palace lock file to exist after acquire/release"
-    body = lock_files[0].read_text()
-    # One identity line, no accumulation.
-    assert body.count("\n") <= 1, f"lock body must not grow across re-acquires; got {body!r}"
+    # Read as bytes so the byte-0 sentinel (\x00) is preserved without
+    # decode quirks; the bound is on the file size, not its line count.
+    body = lock_files[0].read_bytes()
+    # Body is byte-0 sentinel + identity (no trailing accumulation).
+    # Identity is ``f"{pid} {sys.argv[:3]}"``; cap at a generous bound that
+    # still rules out unbounded growth across the 5 re-acquires.
+    assert len(body) < 1024, f"lock body must not grow across re-acquires; got {len(body)} bytes"
 
 
 def test_mine_global_lock_is_alias_for_back_compat(tmp_path, monkeypatch):

From 25bfd37644a265b83dc3d437df246e46a885d2a0 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 01:37:32 -0300
Subject: [PATCH 102/127] chore(release): sync 3.3.4 version bump back into
 develop

The v3.3.4 release prep landed on main but was never merged back into
develop, leaving every version-bearing file one release behind. Bumps
pyproject.toml, mempalace/version.py, both plugin manifests, the
marketplace entry, the README badge, and the lockfile to 3.3.4 to match
the tagged release.
---
 .claude-plugin/marketplace.json | 2 +-
 .claude-plugin/plugin.json      | 2 +-
 .codex-plugin/plugin.json       | 2 +-
 README.md                       | 2 +-
 mempalace/version.py            | 2 +-
 pyproject.toml                  | 2 +-
 uv.lock                         | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index aa15e5b..9320057 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -9,7 +9,7 @@
       "name": "mempalace",
       "source": "./.claude-plugin",
       "description": "AI memory system — mine projects and conversations into a searchable palace. 19 MCP tools, auto-save hooks, guided setup.",
-      "version": "3.3.3",
+      "version": "3.3.4",
       "author": {
         "name": "milla-jovovich"
       }
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index a1b69a6..3794c9d 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "mempalace",
-  "version": "3.3.3",
+  "version": "3.3.4",
   "description": "Give your AI a memory — mine projects and conversations into a searchable palace. 19 MCP tools, auto-save hooks, and guided setup.",
   "author": {
     "name": "milla-jovovich"
diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json
index 16b66bb..02d0902 100644
--- a/.codex-plugin/plugin.json
+++ b/.codex-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "mempalace",
-  "version": "3.3.3",
+  "version": "3.3.4",
   "description": "Give your AI a memory — mine projects and conversations into a searchable palace. 19 MCP tools, auto-save hooks, and guided setup.",
   "author": {
     "name": "milla-jovovich"
diff --git a/README.md b/README.md
index d82bcd2..f8404a8 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,7 @@ PRs welcome. See [CONTRIBUTING.md](CONTRIBUTING.md).
 MIT — see [LICENSE](LICENSE).
 
 <!-- Link Definitions -->
-[version-shield]: https://img.shields.io/badge/version-3.3.3-4dc9f6?style=flat-square&labelColor=0a0e14
+[version-shield]: https://img.shields.io/badge/version-3.3.4-4dc9f6?style=flat-square&labelColor=0a0e14
 [release-link]: https://github.com/MemPalace/mempalace/releases
 [python-shield]: https://img.shields.io/badge/python-3.9+-7dd8f8?style=flat-square&labelColor=0a0e14&logo=python&logoColor=7dd8f8
 [python-link]: https://www.python.org/
diff --git a/mempalace/version.py b/mempalace/version.py
index 7f40b31..1db1b9d 100644
--- a/mempalace/version.py
+++ b/mempalace/version.py
@@ -1,3 +1,3 @@
 """Single source of truth for the MemPalace package version."""
 
-__version__ = "3.3.3"
+__version__ = "3.3.4"
diff --git a/pyproject.toml b/pyproject.toml
index 18228d7..ae2ea27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mempalace"
-version = "3.3.3"
+version = "3.3.4"
 description = "Give your AI a memory — mine projects and conversations into a searchable palace. No API key required."
 readme = "README.md"
 requires-python = ">=3.9"
diff --git a/uv.lock b/uv.lock
index ef1a706..04f9303 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1169,7 +1169,7 @@ wheels = [
 
 [[package]]
 name = "mempalace"
-version = "3.3.3"
+version = "3.3.4"
 source = { editable = "." }
 dependencies = [
     { name = "chromadb" },

From c35686c9e11a67b09af0951212183eb158a093e3 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 01:37:46 -0300
Subject: [PATCH 103/127] docs(install): recommend uv as the package manager
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-user installs now lead with `uv tool install mempalace`, with
`pip install mempalace` kept as a fallback. Dev/contributor docs lead
with `uv sync --extra dev` and `uv run` for tests/benchmarks/lint, with
the equivalent pip recipe kept inline. The shipped `/mempalace:init`
skill instructions (mempalace/instructions/init.md) try `uv tool install`
first when uv is on PATH, then fall back through the pip variants.

Adds a .python-version pin at 3.12 because the lockfile's
onnxruntime==1.24.3 only ships wheels for Python >=3.11; without the
pin, `uv sync` on a host where uv prefers 3.10 fails with no source
distribution available, which would make the documented command a
footgun. pyproject's `requires-python = ">=3.9"` is unchanged — pip
users on 3.9/3.10 are unaffected.

Files updated: README.md, CONTRIBUTING.md, CLAUDE.md, the gemini-cli
guide and example, the .claude-plugin / .codex-plugin READMEs, the
mempalace SKILL, the openclaw SKILL, tools/save.md, the three
benchmarks docs, and the corresponding website mirrors.
---
 .claude-plugin/README.md                 |  2 +-
 .claude-plugin/skills/mempalace/SKILL.md |  4 ++--
 .codex-plugin/README.md                  |  6 +++---
 .python-version                          |  1 +
 CLAUDE.md                                | 12 +++++------
 CONTRIBUTING.md                          | 14 +++++++-----
 README.md                                | 11 +++++++---
 benchmarks/BENCHMARKS.md                 |  6 +++---
 benchmarks/HYBRID_MODE.md                |  2 +-
 benchmarks/README.md                     |  2 +-
 examples/gemini_cli_setup.md             | 19 +++++++++++------
 integrations/openclaw/SKILL.md           |  4 ++--
 mempalace/instructions/init.md           | 27 ++++++++++++++++--------
 tools/save.md                            |  2 +-
 website/guide/gemini-cli.md              | 18 +++++++++++-----
 website/guide/getting-started.md         |  9 +++++---
 website/reference/benchmarks.md          |  2 +-
 website/reference/contributing.md        | 15 ++++++++-----
 18 files changed, 99 insertions(+), 57 deletions(-)
 create mode 100644 .python-version

diff --git a/.claude-plugin/README.md b/.claude-plugin/README.md
index a2ed080..b6708bb 100644
--- a/.claude-plugin/README.md
+++ b/.claude-plugin/README.md
@@ -23,7 +23,7 @@ claude plugin add /path/to/mempalace
 
 ## Post-Install Setup
 
-After installing the plugin, run the init command to complete setup (pip install, MCP configuration, etc.):
+After installing the plugin, run the init command to complete setup (installs the `mempalace` package via `uv tool` or `pip`, configures MCP, etc.):
 
 ```
 /mempalace:init
diff --git a/.claude-plugin/skills/mempalace/SKILL.md b/.claude-plugin/skills/mempalace/SKILL.md
index ae60fca..1ee3715 100644
--- a/.claude-plugin/skills/mempalace/SKILL.md
+++ b/.claude-plugin/skills/mempalace/SKILL.md
@@ -16,10 +16,10 @@ Ensure `mempalace` is installed:
 mempalace --version
 ```
 
-If not installed:
+If not installed (uv recommended):
 
 ```bash
-pip install mempalace
+uv tool install mempalace   # or: pip install mempalace
 ```
 
 ## Usage
diff --git a/.codex-plugin/README.md b/.codex-plugin/README.md
index 6502eb6..d7f4637 100644
--- a/.codex-plugin/README.md
+++ b/.codex-plugin/README.md
@@ -6,7 +6,7 @@ Give your AI a persistent memory -- mine projects and conversations into a searc
 
 - Python 3.9+
 - Codex CLI installed and configured
-- `pip install mempalace`
+- `uv tool install mempalace` (recommended) or `pip install mempalace`
 
 ## Installation
 
@@ -39,10 +39,10 @@ git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
 ```
 
-2. Install the Python package:
+2. Install the Python package (uv recommended):
 
 ```bash
-pip install -e .
+uv sync   # or: pip install -e .
 ```
 
 3. The `.codex-plugin` directory is already in the repo root. Codex CLI will detect it automatically when you run Codex from inside the repository.
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..e4fba21
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12
diff --git a/CLAUDE.md b/CLAUDE.md
index 13dfac3..6f9274e 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -36,26 +36,26 @@ We do not accept summarization of user content, cloud storage/sync features, tel
 ## Setup
 
 ```bash
-pip install -e ".[dev]"
+uv sync --extra dev   # recommended; or: pip install -e ".[dev]"
 ```
 
 ## Commands
 
 ```bash
 # Run tests
-python -m pytest tests/ -v --ignore=tests/benchmarks
+uv run pytest tests/ -v --ignore=tests/benchmarks
 
 # Run tests with coverage
-python -m pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing
+uv run pytest tests/ -v --ignore=tests/benchmarks --cov=mempalace --cov-report=term-missing
 
 # Lint
-ruff check .
+uv run ruff check .
 
 # Format
-ruff format .
+uv run ruff format .
 
 # Format check (CI mode)
-ruff format --check .
+uv run ruff format --check .
 ```
 
 ## Project Structure
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9c6501d..c93d379 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -10,13 +10,17 @@ git clone https://github.com/<your-username>/mempalace.git
 cd mempalace
 git remote add upstream https://github.com/MemPalace/mempalace.git
 
-pip install -e ".[dev]"    # installs with dev dependencies (pytest, build, twine)
+# Recommended: uv (https://docs.astral.sh/uv/) handles the venv for you
+uv sync --extra dev
+
+# Or with pip in your own venv:
+# pip install -e ".[dev]"
 ```
 
 ## Running Tests
 
 ```bash
-pytest tests/ -v
+uv run pytest tests/ -v
 ```
 
 All tests must pass before submitting a PR. Tests should run without API keys or network access.
@@ -25,10 +29,10 @@ All tests must pass before submitting a PR. Tests should run without API keys or
 
 ```bash
 # Quick test (20 questions, ~30 seconds)
-python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json --limit 20
+uv run python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json --limit 20
 
 # Full benchmark (500 questions, ~5 minutes)
-python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json
+uv run python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json
 ```
 
 See [benchmarks/README.md](benchmarks/README.md) for data download instructions and reproduction guide.
@@ -49,7 +53,7 @@ assets/             ← logo + brand
 1. Fork the repo and create a feature branch: `git checkout -b feat/my-thing`
 2. Write your code
 3. Add or update tests if applicable
-4. Run `pytest tests/ -v` — everything must pass
+4. Run `uv run pytest tests/ -v` — everything must pass
 5. Commit with a clear message following [conventional commits](https://www.conventionalcommits.org/):
    - `feat: add Notion export format`
    - `fix: handle empty transcript files`
diff --git a/README.md b/README.md
index f8404a8..28207f3 100644
--- a/README.md
+++ b/README.md
@@ -49,11 +49,16 @@ Architecture, concepts, and mining flows:
 
 ## Install
 
+We recommend [`uv`](https://docs.astral.sh/uv/) — `uv tool install` puts
+the `mempalace` CLI in an isolated environment on your PATH:
+
 ```bash
-pip install mempalace
+uv tool install mempalace
 mempalace init ~/projects/myapp
 ```
 
+If you prefer pip, `pip install mempalace` still works.
+
 ## Quickstart
 
 ```bash
@@ -120,9 +125,9 @@ own research page for their published numbers.
 ```bash
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
-pip install -e ".[dev]"
+uv sync --extra dev   # or: pip install -e ".[dev]"
 # see benchmarks/README.md for dataset download commands
-python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json
+uv run python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json
 ```
 
 ---
diff --git a/benchmarks/BENCHMARKS.md b/benchmarks/BENCHMARKS.md
index 77a963e..755e950 100644
--- a/benchmarks/BENCHMARKS.md
+++ b/benchmarks/BENCHMARKS.md
@@ -344,7 +344,7 @@ The palace classifies each question into one of 5 halls. Pass 1 searches only wi
 ```bash
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
-pip install -e ".[dev]"
+uv sync --extra dev   # or: pip install -e ".[dev]"
 mkdir -p /tmp/longmemeval-data
 curl -fsSL -o /tmp/longmemeval-data/longmemeval_s_cleaned.json \
   https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned/resolve/main/longmemeval_s_cleaned.json
@@ -724,8 +724,8 @@ python benchmarks/longmemeval_bench.py /tmp/longmemeval-data/longmemeval_s_clean
 The question: how much of the 96.6% → 99.4% improvement is the heuristics, and how much would come from just using a better embedding model?
 
 ```bash
-pip install fastembed
-python benchmarks/longmemeval_bench.py /tmp/longmemeval-data/longmemeval_s_cleaned.json \
+uv pip install fastembed   # or: pip install fastembed
+uv run python benchmarks/longmemeval_bench.py /tmp/longmemeval-data/longmemeval_s_cleaned.json \
   --mode raw --embed-model bge-large
 ```
 
diff --git a/benchmarks/HYBRID_MODE.md b/benchmarks/HYBRID_MODE.md
index 37f315e..9a32596 100644
--- a/benchmarks/HYBRID_MODE.md
+++ b/benchmarks/HYBRID_MODE.md
@@ -198,7 +198,7 @@ python benchmarks/longmemeval_bench.py data/longmemeval_s_cleaned.json --mode hy
 # Setup
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
-pip install -e ".[dev]"
+uv sync --extra dev   # or: pip install -e ".[dev]"
 
 # Download data
 mkdir -p /tmp/longmemeval-data
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 417ef05..5216e66 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -7,7 +7,7 @@ Run the exact same benchmarks we report. Clone, install, run.
 ```bash
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
-pip install -e ".[dev]"
+uv sync --extra dev   # or: pip install -e ".[dev]"
 ```
 
 ## Benchmark 1: LongMemEval (500 questions)
diff --git a/examples/gemini_cli_setup.md b/examples/gemini_cli_setup.md
index 22bfc67..c1989d5 100644
--- a/examples/gemini_cli_setup.md
+++ b/examples/gemini_cli_setup.md
@@ -9,17 +9,24 @@ This guide explains how to set up MemPalace as a permanent memory for the [Gemin
 
 ## 1. Installation
 
-On many Linux systems, installing Python packages globally is restricted. We recommend using a local virtual environment within the MemPalace directory.
+On many Linux systems, installing Python packages globally is restricted. We
+recommend [`uv`](https://docs.astral.sh/uv/), which creates and manages a
+local virtual environment for you.
 
 ```bash
 # Clone the repository (if you haven't already)
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
 
-# Create a virtual environment
-python3 -m venv .venv
+# Create the venv and install MemPalace + dependencies in editable mode
+uv sync
+```
+
+This produces a `.venv/` directory inside the repo with everything installed.
+If you prefer plain pip:
 
-# Install dependencies and MemPalace in editable mode
+```bash
+python3 -m venv .venv
 .venv/bin/pip install -e .
 ```
 
@@ -29,7 +36,7 @@ Set up your "Palace" (the database) and configure your identity.
 
 ```bash
 # Initialize the palace in the current directory
-.venv/bin/python3 -m mempalace init .
+uv run python -m mempalace init .
 ```
 
 ### Identity and Wings (Optional but Recommended)
@@ -86,7 +93,7 @@ Once connected, Gemini CLI will automatically:
 ### Manual Mining
 If you want the AI to learn from your existing code or docs immediately, run the "mine" command:
 ```bash
-.venv/bin/python3 -m mempalace mine /path/to/your/project
+uv run python -m mempalace mine /path/to/your/project
 ```
 
 ### Verification
diff --git a/integrations/openclaw/SKILL.md b/integrations/openclaw/SKILL.md
index 31ae2cb..4ed4ba0 100644
--- a/integrations/openclaw/SKILL.md
+++ b/integrations/openclaw/SKILL.md
@@ -102,10 +102,10 @@ You have access to a local memory palace via MCP tools. The palace stores verbat
 
 ## Setup
 
-Install MemPalace and populate the palace:
+Install MemPalace and populate the palace (uv recommended):
 
 ```bash
-pip install mempalace
+uv tool install mempalace   # or: pip install mempalace
 mempalace init ~/my-convos
 mempalace mine ~/my-convos
 ```
diff --git a/mempalace/instructions/init.md b/mempalace/instructions/init.md
index 570a525..b43ecc4 100644
--- a/mempalace/instructions/init.md
+++ b/mempalace/instructions/init.md
@@ -11,27 +11,36 @@ tell the user they need Python 3.9+ installed and stop.
 
 ## Step 2: Check if mempalace is already installed
 
-Run `pip show mempalace` to see if the package is already present. If it is,
-report the installed version and skip to Step 4.
+Run `mempalace --version` to see if the CLI is already on the user's PATH.
+If it succeeds, report the installed version and skip to Step 4.
+
+If that fails, fall back to `pip show mempalace` (and `uv tool list` if `uv`
+is available) to detect an existing install. If found, report it and skip
+to Step 4.
 
 ## Step 3: Install mempalace
 
-Run `pip install mempalace`.
+Prefer [`uv`](https://docs.astral.sh/uv/) — it isolates the CLI from system
+Python and avoids most environment-related failures:
+
+1. If `uv` is on PATH (`uv --version`), run `uv tool install mempalace`.
+2. Otherwise run `pip install mempalace`.
 
-### Error handling -- pip failures
+### Error handling -- install failures
 
-If `pip install mempalace` fails, try these fallbacks in order:
+If the install command fails, try these fallbacks in order:
 
-1. Try `pip3 install mempalace`
-2. Try `python -m pip install mempalace` (or `python3 -m pip install mempalace`)
-3. If the error mentions missing build tools or compilation failures (commonly
+1. If `uv tool install` failed, try `pip install mempalace` (or vice versa).
+2. Try `pip3 install mempalace`.
+3. Try `python -m pip install mempalace` (or `python3 -m pip install mempalace`).
+4. If the error mentions missing build tools or compilation failures (commonly
    from chromadb or its native dependencies):
    - On Linux/macOS: suggest `sudo apt-get install build-essential python3-dev`
      (Debian/Ubuntu) or `xcode-select --install` (macOS)
    - On Windows: suggest installing Microsoft C++ Build Tools from
      https://visualstudio.microsoft.com/visual-cpp-build-tools/
    - Then retry the install command
-4. If all attempts fail, report the error clearly and stop.
+5. If all attempts fail, report the error clearly and stop.
 
 ## Step 4: Ask for project directory
 
diff --git a/tools/save.md b/tools/save.md
index 914156b..c2e6748 100644
--- a/tools/save.md
+++ b/tools/save.md
@@ -23,4 +23,4 @@ Behavior:
    `--wing my_research`).
 4. Report back: how many drawers were filed, into which wing/room.
 
-Requires `mempalace` to be installed (`pip install mempalace`).
+Requires `mempalace` to be installed (`uv tool install mempalace` recommended, or `pip install mempalace`).
diff --git a/website/guide/gemini-cli.md b/website/guide/gemini-cli.md
index 137d62c..aa454fe 100644
--- a/website/guide/gemini-cli.md
+++ b/website/guide/gemini-cli.md
@@ -9,22 +9,30 @@ MemPalace works natively with [Gemini CLI](https://github.com/google/gemini-cli)
 
 ## Installation
 
+We recommend [`uv`](https://docs.astral.sh/uv/) — it creates and manages the
+virtual environment for you:
+
 ```bash
 # Clone the repository
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
 
-# Create a virtual environment
-python3 -m venv .venv
+# Create the venv and install MemPalace + dependencies
+uv sync
+```
 
-# Install dependencies
+This produces a `.venv/` directory with the project installed in editable
+mode. If you prefer plain pip, the equivalent is:
+
+```bash
+python3 -m venv .venv
 .venv/bin/pip install -e .
 ```
 
 ## Initialize the Palace
 
 ```bash
-.venv/bin/python3 -m mempalace init .
+uv run python -m mempalace init .
 ```
 
 ### Identity and Project Configuration (Optional)
@@ -88,7 +96,7 @@ Once connected, Gemini CLI will automatically:
 
 Mine existing code or docs:
 ```bash
-.venv/bin/python3 -m mempalace mine /path/to/your/project
+uv run python -m mempalace mine /path/to/your/project
 ```
 
 ### Verification
diff --git a/website/guide/getting-started.md b/website/guide/getting-started.md
index 2dc921d..8a3dff9 100644
--- a/website/guide/getting-started.md
+++ b/website/guide/getting-started.md
@@ -2,12 +2,15 @@
 
 ## Installation
 
-Install MemPalace from PyPI:
+We recommend [`uv`](https://docs.astral.sh/uv/) — `uv tool install` puts
+the `mempalace` CLI in an isolated environment on your PATH:
 
 ```bash
-pip install mempalace
+uv tool install mempalace
 ```
 
+If you prefer pip, `pip install mempalace` still works.
+
 ::: danger Security Warning
 The domain `mempalace.tech` is a **brand-squatting site** not affiliated with this project. It is known to run ad-redirects and potential malware. The official MemPalace distribution is only available via this [GitHub repository](https://github.com/MemPalace/mempalace) and [PyPI](https://pypi.org/project/mempalace/). Never install binaries or scripts from unofficial domains.
 :::
@@ -25,7 +28,7 @@ No API key required for the core local workflow. After installation, the main st
 ```bash
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
-pip install -e ".[dev]"
+uv sync --extra dev   # or: pip install -e ".[dev]"
 ```
 
 ## Quick Start
diff --git a/website/reference/benchmarks.md b/website/reference/benchmarks.md
index 60bc8cb..2cc5feb 100644
--- a/website/reference/benchmarks.md
+++ b/website/reference/benchmarks.md
@@ -113,7 +113,7 @@ Every benchmark runs deterministically from this repository.
 ```bash
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
-pip install -e ".[dev]"
+uv sync --extra dev   # or: pip install -e ".[dev]"
 
 # LongMemEval — raw (96.6%)
 curl -fsSL -o /tmp/longmemeval_s_cleaned.json \
diff --git a/website/reference/contributing.md b/website/reference/contributing.md
index 2b909b4..f7f1513 100644
--- a/website/reference/contributing.md
+++ b/website/reference/contributing.md
@@ -7,13 +7,18 @@ PRs welcome. MemPalace is open source and we welcome contributions of all sizes
 ```bash
 git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
-pip install -e ".[dev]"
+
+# Recommended: uv (https://docs.astral.sh/uv/) manages the venv for you
+uv sync --extra dev
+
+# Or with pip in your own venv:
+# pip install -e ".[dev]"
 ```
 
 ## Running Tests
 
 ```bash
-pytest tests/ -v
+uv run pytest tests/ -v
 ```
 
 All tests must pass before submitting a PR. Tests should run without API keys or network access.
@@ -22,10 +27,10 @@ All tests must pass before submitting a PR. Tests should run without API keys or
 
 ```bash
 # Quick test (20 questions, ~30 seconds)
-python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json --limit 20
+uv run python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json --limit 20
 
 # Full benchmark (500 questions, ~5 minutes)
-python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json
+uv run python benchmarks/longmemeval_bench.py /path/to/longmemeval_s_cleaned.json
 ```
 
 See [Benchmarks](/reference/benchmarks) for data download instructions.
@@ -35,7 +40,7 @@ See [Benchmarks](/reference/benchmarks) for data download instructions.
 1. Fork the repo and create a feature branch: `git checkout -b feat/my-thing`
 2. Write your code
 3. Add or update tests if applicable
-4. Run `pytest tests/ -v` — everything must pass
+4. Run `uv run pytest tests/ -v` — everything must pass
 5. Commit with clear [conventional commits](https://www.conventionalcommits.org/):
    - `feat: add Notion export format`
    - `fix: handle empty transcript files`

From c08ec00f4d2a94586fe669f8cb503ff7afff1905 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 02:01:36 -0300
Subject: [PATCH 104/127] fix: address Copilot review on PATH visibility (PR
 #1414)

- mempalace/instructions/init.md: only skip Step 4 when `mempalace
  --version` succeeds. `pip show` / `uv tool list` reporting an install
  is not enough -- if the package lives in an unactivated venv, Step 5
  (`mempalace init ...`) fails with command-not-found. Treat that case
  as not-installed and re-install via Step 3 into a PATH-visible
  location.
- .codex-plugin/README.md: switch the git-install recipe from `uv sync`
  to `uv tool install --editable .` so the bundled `plugin.json`
  (which invokes `mempalace-mcp` by bare name) can launch the MCP
  server. Plain `uv sync` only puts the script in `.venv/bin/`, which
  Codex won't find unless the venv is activated first.
---
 .codex-plugin/README.md        |  9 +++++++--
 mempalace/instructions/init.md | 15 +++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/.codex-plugin/README.md b/.codex-plugin/README.md
index d7f4637..2af714c 100644
--- a/.codex-plugin/README.md
+++ b/.codex-plugin/README.md
@@ -39,12 +39,17 @@ git clone https://github.com/MemPalace/mempalace.git
 cd mempalace
 ```
 
-2. Install the Python package (uv recommended):
+2. Install the Python package so the `mempalace-mcp` script lands on
+   your PATH (the bundled `plugin.json` invokes it by bare name):
 
 ```bash
-uv sync   # or: pip install -e .
+uv tool install --editable .   # or: pip install -e .
 ```
 
+   Plain `uv sync` is **not** enough here — it installs the scripts into
+   `.venv/bin/`, which Codex will not find unless you activate the venv
+   before launching Codex.
+
 3. The `.codex-plugin` directory is already in the repo root. Codex CLI will detect it automatically when you run Codex from inside the repository.
 
 4. Initialize your palace:
diff --git a/mempalace/instructions/init.md b/mempalace/instructions/init.md
index b43ecc4..347367a 100644
--- a/mempalace/instructions/init.md
+++ b/mempalace/instructions/init.md
@@ -11,12 +11,15 @@ tell the user they need Python 3.9+ installed and stop.
 
 ## Step 2: Check if mempalace is already installed
 
-Run `mempalace --version` to see if the CLI is already on the user's PATH.
-If it succeeds, report the installed version and skip to Step 4.
-
-If that fails, fall back to `pip show mempalace` (and `uv tool list` if `uv`
-is available) to detect an existing install. If found, report it and skip
-to Step 4.
+Run `mempalace --version`. If it succeeds, the CLI is on PATH — report
+the installed version and skip to Step 4.
+
+If `mempalace --version` fails, **do not** skip to Step 4 just because
+`pip show mempalace` or `uv tool list` reports the package as installed:
+the package may live inside a venv that isn't activated, in which case
+Step 5 (`mempalace init ...`) will fail with `command not found`. Treat
+that case as not-installed and continue to Step 3, which will (re)install
+into a PATH-visible location via `uv tool install` or `pip`.
 
 ## Step 3: Install mempalace
 

From 3a763603011df4b8989c987b1157ebd19626af28 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 02:09:00 -0300
Subject: [PATCH 105/127] fix(hooks): per-target PID guard with atomic claim
 (#1212, #1206)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hook PID guard used a single global ``~/.mempalace/hook_state/mine.pid``
file, which failed two ways:

1. ``_mine_already_running`` read-then-spawn was a TOCTOU race. Two
   near-simultaneous Stop hook fires both passed the existence/liveness
   check before either wrote — so both ended up calling
   ``_spawn_mine``.

2. ``_spawn_mine`` unconditionally overwrote the global PID file with
   the new child's PID. The first PID was lost, orphaning the first
   child. The user-visible result in #1212 was two concurrent
   ``mempalace mine`` processes running against the same source, both
   driving HNSW inserts in parallel — exactly the corruption pattern
   the guard was meant to prevent. #1206 reported the same shape from
   the perspective of the user (two mines hung on a 350MB folder).

Replace the global file with per-target slots under
``~/.mempalace/hook_state/mine_pids/``, keyed by sha256 of the mine
sub-arguments (everything after ``mine``). The slot is claimed via
``O_CREAT | O_EXCL`` so the claim is atomic — two simultaneous fires
can never both pass.  Stale slots (PID exists but is dead) are
reclaimed transparently. Different targets (e.g. project mine vs
transcript ingest, or two different MEMPAL_DIRs) get independent
slots and run in parallel.

The mine subprocess receives its slot path via
``MEMPALACE_MINE_PID_FILE`` env var; ``miner._cleanup_mine_pid_file``
reads that var on exit and removes the slot if it points at our PID,
so orphaned slots from crashed mines don't accumulate.

Also routes ``_ingest_transcript`` through ``_spawn_mine`` so the
transcript ingest path now participates in the same dedup — repeated
Stop fires for the same transcript no longer stack parallel mines.

Closes #1212
Closes #1206
---
 mempalace/hooks_cli.py  | 174 ++++++++++++++++++++++++------
 mempalace/miner.py      |  37 ++++---
 tests/test_hooks_cli.py | 231 ++++++++++++++++++++++++++++++++++------
 tests/test_miner.py     |  20 ++--
 4 files changed, 368 insertions(+), 94 deletions(-)

diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py
index 258f1e0..f7753a0 100644
--- a/mempalace/hooks_cli.py
+++ b/mempalace/hooks_cli.py
@@ -6,6 +6,7 @@
 Supported harnesses: claude-code, codex (extensible to cursor, gemini, etc.)
 """
 
+import hashlib
 import json
 import os
 import re
@@ -256,7 +257,45 @@ def _get_mine_targets() -> list[tuple[str, str]]:
     return targets
 
 
-_MINE_PID_FILE = STATE_DIR / "mine.pid"
+# Per-target PID guard.
+#
+# Hook fires ingest mines in the background. If a previous fire's child is
+# still running for the *same* target (same source dir, mode, wing), the new
+# fire should skip rather than pile up — multiple concurrent mines against the
+# same source corrupt the HNSW index and exhaust disk via duplicate upserts
+# (#1212, #1206). But mines targeting *different* sources / modes must remain
+# independent so the user can have e.g. project-mining and transcript-ingest
+# running in parallel.
+#
+# The single ``mine.pid`` global file used previously failed both ways: the
+# guard was rebuilt every spawn (so two near-simultaneous fires both passed
+# the check before either wrote), and the file was unconditionally overwritten
+# (so the second spawn lost the first PID, orphaning it). The replacement is
+# a directory of per-target slots, claimed via ``O_CREAT | O_EXCL`` so the
+# claim is atomic and per-target.
+_MINE_PID_DIR = STATE_DIR / "mine_pids"
+
+# The per-process PID file path is communicated to the mine subprocess via
+# this env var so the child's cleanup hook (in miner.py) can remove its
+# own slot on exit without scanning the whole directory.
+_MINE_PID_FILE_ENV = "MEMPALACE_MINE_PID_FILE"
+
+
+def _pid_file_for_cmd(cmd: list[str]) -> Path:
+    """Return the per-target PID file path for a mine subcommand.
+
+    The key is derived from the mine arguments (everything after ``mine``)
+    so different (dir, mode, wing) combinations get independent slots.
+    Two fires with the same arguments collapse to the same slot — which is
+    exactly the dedup we want.
+    """
+    try:
+        idx = cmd.index("mine")
+        key = " ".join(cmd[idx:])
+    except ValueError:
+        key = " ".join(cmd)
+    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
+    return _MINE_PID_DIR / f"mine_{digest}.pid"
 
 
 def _pid_alive(pid: int) -> bool:
@@ -292,22 +331,96 @@ def _pid_alive(pid: int) -> bool:
         return False
 
 
-def _mine_already_running() -> bool:
-    """Return True if a background mine process from a previous hook fire is still alive."""
+def _mine_already_running(cmd: list[str]) -> bool:
+    """Return True if a previous mine for ``cmd``'s target is still alive."""
+    pid_file = _pid_file_for_cmd(cmd)
     try:
-        pid = int(_MINE_PID_FILE.read_text().strip())
-    except (OSError, ValueError):
+        recorded = pid_file.read_text().strip()
+    except OSError:
+        return False
+    if not recorded.isdigit():
         return False
-    return _pid_alive(pid)
+    return _pid_alive(int(recorded))
+
+
+def _claim_mine_slot(cmd: list[str]) -> Path | None:
+    """Atomically reserve the per-target PID slot for ``cmd``.
+
+    Returns the slot path on success, or ``None`` if the target is
+    already being mined by a live process. The reservation is done via
+    ``O_CREAT | O_EXCL`` so two simultaneous hook fires can never both
+    pass the check; one wins, the other returns None.
+
+    A stale slot (file exists but the recorded PID is dead) is reclaimed
+    transparently — orphan miners that crashed without cleanup do not
+    block future hook fires forever.
+    """
+    pid_file = _pid_file_for_cmd(cmd)
+    pid_file.parent.mkdir(parents=True, exist_ok=True)
+    try:
+        fd = os.open(str(pid_file), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
+        os.close(fd)
+        return pid_file
+    except FileExistsError:
+        pass
+    # Slot exists. If the holder is alive, defer.
+    if _mine_already_running(cmd):
+        return None
+    # Stale entry; reclaim. The unlink+create is racy against another hook
+    # firing right now, but the second create's O_EXCL will fail and that
+    # caller will see the live PID via the next round.
+    try:
+        pid_file.unlink()
+    except FileNotFoundError:
+        pass
+    except OSError:
+        return None
+    try:
+        fd = os.open(str(pid_file), os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o600)
+        os.close(fd)
+        return pid_file
+    except FileExistsError:
+        return None
 
 
 def _spawn_mine(cmd: list) -> None:
-    """Spawn a mine subprocess, write its PID to the lock file, log to hook.log."""
+    """Spawn a mine subprocess if no live mine is already targeting it.
+
+    The PID slot is claimed atomically *before* the spawn, so two near-
+    simultaneous hook fires can't both proceed — the second sees the
+    claimed slot and silently skips. The spawned process inherits a
+    ``MEMPALACE_MINE_PID_FILE`` env var so its cleanup hook can remove
+    the slot on exit without scanning the directory.
+    """
     STATE_DIR.mkdir(parents=True, exist_ok=True)
     log_path = STATE_DIR / "hook.log"
+    pid_file = _claim_mine_slot(cmd)
+    if pid_file is None:
+        _log(f"Skipping mine: target already running ({' '.join(cmd[-3:])})")
+        return
+    child_env = os.environ.copy()
+    child_env[_MINE_PID_FILE_ENV] = str(pid_file)
     with open(log_path, "a") as log_f:
-        proc = subprocess.Popen(cmd, stdout=log_f, stderr=log_f, **_detached_popen_kwargs())
-    _MINE_PID_FILE.write_text(str(proc.pid))
+        try:
+            proc = subprocess.Popen(
+                cmd,
+                stdout=log_f,
+                stderr=log_f,
+                env=child_env,
+                **_detached_popen_kwargs(),
+            )
+        except OSError:
+            # Spawn failed; release the slot we just claimed so the next
+            # hook fire can try again rather than skipping forever.
+            try:
+                pid_file.unlink()
+            except OSError:
+                pass
+            raise
+    try:
+        pid_file.write_text(str(proc.pid))
+    except OSError:
+        pass
 
 
 def _maybe_auto_ingest():
@@ -317,13 +430,15 @@ def _maybe_auto_ingest():
     in the hook handlers — this function does not handle them, to avoid
     asymmetric interpreter handling and PID-file overwrite when both
     targets fire from a single hook call (#1231 review).
+
+    Per-target dedup is done by ``_spawn_mine`` itself: each (dir, mode)
+    target gets its own PID slot, so distinct targets never block each
+    other but a re-fire of the same target while the previous one is
+    still running is silently skipped.
     """
     targets = _get_mine_targets()
     if not targets:
         return
-    if _mine_already_running():
-        _log("Skipping auto-ingest: mine already running")
-        return
     for mine_dir, mode in targets:
         try:
             _spawn_mine([_mempalace_python(), "-m", "mempalace", "mine", mine_dir, "--mode", mode])
@@ -518,25 +633,22 @@ def _ingest_transcript(transcript_path: str):
         return
 
     try:
-        log_path = STATE_DIR / "hook.log"
-        STATE_DIR.mkdir(parents=True, exist_ok=True)
-        with open(log_path, "a") as log_f:
-            subprocess.Popen(
-                [
-                    _mempalace_python(),
-                    "-m",
-                    "mempalace",
-                    "mine",
-                    str(path.parent),
-                    "--mode",
-                    "convos",
-                    "--wing",
-                    "sessions",
-                ],
-                stdout=log_f,
-                stderr=log_f,
-                **_detached_popen_kwargs(),
-            )
+        # Route through ``_spawn_mine`` so the per-target PID guard kicks
+        # in here too — repeated Stop/PreCompact fires for the same
+        # transcript should not stack up parallel ingest mines.
+        _spawn_mine(
+            [
+                _mempalace_python(),
+                "-m",
+                "mempalace",
+                "mine",
+                str(path.parent),
+                "--mode",
+                "convos",
+                "--wing",
+                "sessions",
+            ]
+        )
         _log(f"Transcript ingest started: {path.name}")
     except OSError:
         pass
diff --git a/mempalace/miner.py b/mempalace/miner.py
index 09cc517..e919c58 100644
--- a/mempalace/miner.py
+++ b/mempalace/miner.py
@@ -1206,30 +1206,29 @@ def _mine_impl(
 
 
 def _cleanup_mine_pid_file() -> None:
-    """Remove the global mine PID file if it currently points at us.
-
-    The PID file (``~/.mempalace/hook_state/mine.pid``, written by the
-    hook in :func:`mempalace.hooks_cli._spawn_mine`) tracks the PID of
-    the most recently spawned mine subprocess so the hook can dedup
-    concurrent auto-ingest fires. When that subprocess exits — cleanly,
-    on error, or via Ctrl-C — it should remove its own entry so the
-    next hook fire isn't briefly fooled by a stale PID before
-    ``_pid_alive`` returns False.
-
-    We only delete the file if it claims our own PID; any other PID is
-    left alone (could be an unrelated mine running concurrently from
-    a different worktree / session).
+    """Remove this process's per-target PID slot on exit.
+
+    Hook-spawned mines receive ``MEMPALACE_MINE_PID_FILE`` in their env
+    pointing at the slot the hook claimed for them
+    (``~/.mempalace/hook_state/mine_pids/mine_<sha>.pid``). When the
+    subprocess exits — cleanly, on error, or via Ctrl-C — it removes its
+    own slot so the next hook fire isn't briefly fooled by a stale PID
+    before ``_pid_alive`` returns False.
+
+    Only delete the slot if it claims our own PID; any other PID is left
+    alone (it could belong to an unrelated mine that just claimed the
+    same slot via a stale-reclaim race).
     """
-    try:
-        from .hooks_cli import _MINE_PID_FILE
-    except Exception:
+    pid_file_env = os.environ.get("MEMPALACE_MINE_PID_FILE", "")
+    if not pid_file_env:
         return
     try:
-        if not _MINE_PID_FILE.exists():
+        pid_file = Path(pid_file_env)
+        if not pid_file.exists():
             return
-        recorded = _MINE_PID_FILE.read_text().strip()
+        recorded = pid_file.read_text().strip()
         if recorded and recorded.isdigit() and int(recorded) == os.getpid():
-            _MINE_PID_FILE.unlink()
+            pid_file.unlink()
     except OSError:
         # Best-effort cleanup; never fail the mine over PID bookkeeping.
         pass
diff --git a/tests/test_hooks_cli.py b/tests/test_hooks_cli.py
index c4763c9..0918255 100644
--- a/tests/test_hooks_cli.py
+++ b/tests/test_hooks_cli.py
@@ -3,6 +3,7 @@
 import json
 import os
 import subprocess
+import sys
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 
@@ -441,7 +442,7 @@ def test_maybe_auto_ingest_with_env(tmp_path):
     mempal_dir.mkdir()
     with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
         with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
-            with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
+            with patch("mempalace.hooks_cli._MINE_PID_DIR", tmp_path / "mine_pids"):
                 with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
                     _maybe_auto_ingest()
                     mock_popen.assert_called_once()
@@ -463,7 +464,7 @@ def test_maybe_auto_ingest_uses_mempalace_python(tmp_path):
     mempal_dir.mkdir()
     with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
         with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
-            with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
+            with patch("mempalace.hooks_cli._MINE_PID_DIR", tmp_path / "mine_pids"):
                 with patch(
                     "mempalace.hooks_cli._mempalace_python", return_value="/fake/venv/python"
                 ):
@@ -513,7 +514,7 @@ def test_maybe_auto_ingest_ignores_transcript_arg_path(tmp_path):
     transcript.write_text("")
     with patch.dict("os.environ", {}, clear=True):
         with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
-            with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
+            with patch("mempalace.hooks_cli._MINE_PID_DIR", tmp_path / "mine_pids"):
                 with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
                     _maybe_auto_ingest()
                     mock_popen.assert_not_called()
@@ -543,21 +544,38 @@ def test_maybe_auto_ingest_oserror(tmp_path):
     mempal_dir.mkdir()
     with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
         with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
-            with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
+            with patch("mempalace.hooks_cli._MINE_PID_DIR", tmp_path / "mine_pids"):
                 with patch("mempalace.hooks_cli.subprocess.Popen", side_effect=OSError("fail")):
                     _maybe_auto_ingest()  # should not raise
 
 
 def test_maybe_auto_ingest_skips_when_mine_running(tmp_path):
-    """Does not spawn a new mine process if one is already running."""
+    """Does not spawn a new mine process if a mine for the same target is alive."""
     mempal_dir = tmp_path / "project"
     mempal_dir.mkdir()
+    pid_dir = tmp_path / "mine_pids"
     with patch.dict("os.environ", {"MEMPAL_DIR": str(mempal_dir)}):
         with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
-            with patch("mempalace.hooks_cli._mine_already_running", return_value=True):
-                with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
-                    _maybe_auto_ingest()
-                    mock_popen.assert_not_called()
+            with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+                # Pre-populate the per-target slot with a live PID (our own).
+                from mempalace.hooks_cli import _pid_file_for_cmd
+
+                cmd = [
+                    sys.executable,
+                    "-m",
+                    "mempalace",
+                    "mine",
+                    str(mempal_dir.resolve()),
+                    "--mode",
+                    "projects",
+                ]
+                pid_file = _pid_file_for_cmd(cmd)
+                pid_file.parent.mkdir(parents=True, exist_ok=True)
+                pid_file.write_text(str(os.getpid()))
+                with patch("mempalace.hooks_cli._mempalace_python", return_value=sys.executable):
+                    with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                        _maybe_auto_ingest()
+                        mock_popen.assert_not_called()
 
 
 # --- _detached_popen_kwargs ---
@@ -604,7 +622,7 @@ def test_detached_popen_kwargs_windows(monkeypatch):
 def test_spawn_mine_uses_detached_kwargs(tmp_path):
     """_spawn_mine forwards detached kwargs so the hook can exit cleanly."""
     with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
-        with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", tmp_path / "mine_pids"):
             with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
                 mock_popen.return_value.pid = 9999
                 from mempalace.hooks_cli import _spawn_mine
@@ -617,52 +635,195 @@ def test_spawn_mine_uses_detached_kwargs(tmp_path):
                 assert kwargs.get("close_fds") is True
 
 
+def test_spawn_mine_skips_when_target_running(tmp_path):
+    """A second spawn for the same cmd target while the first is alive must skip."""
+    pid_dir = tmp_path / "mine_pids"
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+            from mempalace.hooks_cli import _pid_file_for_cmd, _spawn_mine
+
+            cmd = ["mempalace", "mine", "/tmp/proj", "--mode", "projects"]
+            pid_file = _pid_file_for_cmd(cmd)
+            pid_file.parent.mkdir(parents=True, exist_ok=True)
+            pid_file.write_text(str(os.getpid()))  # live PID
+
+            with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                _spawn_mine(cmd)
+                mock_popen.assert_not_called()
+
+
+def test_spawn_mine_distinct_targets_dont_block_each_other(tmp_path):
+    """Two spawn calls for *different* targets both proceed."""
+    pid_dir = tmp_path / "mine_pids"
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+            with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                from mempalace.hooks_cli import _spawn_mine
+
+                mock_popen.return_value.pid = 1111
+                _spawn_mine(["mempalace", "mine", "/tmp/a", "--mode", "projects"])
+                mock_popen.return_value.pid = 2222
+                _spawn_mine(["mempalace", "mine", "/tmp/b", "--mode", "projects"])
+                assert mock_popen.call_count == 2
+
+
+def test_spawn_mine_reclaims_stale_slot(tmp_path):
+    """A slot pointing at a dead PID is reclaimed silently."""
+    pid_dir = tmp_path / "mine_pids"
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+            from mempalace.hooks_cli import _pid_file_for_cmd, _spawn_mine
+
+            cmd = ["mempalace", "mine", "/tmp/proj", "--mode", "projects"]
+            pid_file = _pid_file_for_cmd(cmd)
+            pid_file.parent.mkdir(parents=True, exist_ok=True)
+            pid_file.write_text("999999999")  # dead PID
+
+            with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                mock_popen.return_value.pid = 4242
+                _spawn_mine(cmd)
+                mock_popen.assert_called_once()
+                # New PID is recorded in the reclaimed slot.
+                assert pid_file.read_text().strip() == "4242"
+
+
+def test_spawn_mine_releases_slot_on_oserror(tmp_path):
+    """If Popen raises OSError, the claimed slot must be released."""
+    pid_dir = tmp_path / "mine_pids"
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+            from mempalace.hooks_cli import _pid_file_for_cmd, _spawn_mine
+
+            cmd = ["mempalace", "mine", "/tmp/proj", "--mode", "projects"]
+            pid_file = _pid_file_for_cmd(cmd)
+
+            with patch("mempalace.hooks_cli.subprocess.Popen", side_effect=OSError("spawn fail")):
+                with pytest.raises(OSError):
+                    _spawn_mine(cmd)
+                assert (
+                    not pid_file.exists()
+                ), "slot must be released so the next hook fire isn't permanently blocked"
+
+
+def test_spawn_mine_passes_pid_file_env_var(tmp_path):
+    """The child inherits MEMPALACE_MINE_PID_FILE so its cleanup hook can find the slot."""
+    pid_dir = tmp_path / "mine_pids"
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+            with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                mock_popen.return_value.pid = 5555
+                from mempalace.hooks_cli import _pid_file_for_cmd, _spawn_mine
+
+                cmd = ["mempalace", "mine", "/tmp/x", "--mode", "projects"]
+                _spawn_mine(cmd)
+                child_env = mock_popen.call_args.kwargs.get("env", {})
+                expected = str(_pid_file_for_cmd(cmd))
+                assert child_env.get("MEMPALACE_MINE_PID_FILE") == expected
+
+
 def test_ingest_transcript_uses_detached_kwargs(tmp_path):
     """_ingest_transcript spawns the convos mine with detach kwargs."""
     transcript = tmp_path / "session.jsonl"
     transcript.write_text("x" * 200)  # > 100 byte gate
     with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
-        with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
-            from mempalace.hooks_cli import _ingest_transcript
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", tmp_path / "mine_pids"):
+            with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                from mempalace.hooks_cli import _ingest_transcript
+
+                _ingest_transcript(str(transcript))
+                assert mock_popen.called
+                kwargs = mock_popen.call_args.kwargs
+                assert kwargs.get("stdin") is subprocess.DEVNULL
+                assert kwargs.get("close_fds") is True
+
+
+def test_ingest_transcript_skips_when_target_running(tmp_path):
+    """Repeated transcript ingests for the same transcript should dedup."""
+    transcript = tmp_path / "session.jsonl"
+    transcript.write_text("x" * 200)
+    pid_dir = tmp_path / "mine_pids"
+    with patch("mempalace.hooks_cli.STATE_DIR", tmp_path):
+        with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+            with patch("mempalace.hooks_cli._mempalace_python", return_value=sys.executable):
+                from mempalace.hooks_cli import _ingest_transcript, _pid_file_for_cmd
+
+                expected_cmd = [
+                    sys.executable,
+                    "-m",
+                    "mempalace",
+                    "mine",
+                    str(transcript.parent),
+                    "--mode",
+                    "convos",
+                    "--wing",
+                    "sessions",
+                ]
+                pid_file = _pid_file_for_cmd(expected_cmd)
+                pid_file.parent.mkdir(parents=True, exist_ok=True)
+                pid_file.write_text(str(os.getpid()))  # live target
 
-            _ingest_transcript(str(transcript))
-            assert mock_popen.called
-            kwargs = mock_popen.call_args.kwargs
-            assert kwargs.get("stdin") is subprocess.DEVNULL
-            assert kwargs.get("close_fds") is True
+                with patch("mempalace.hooks_cli.subprocess.Popen") as mock_popen:
+                    _ingest_transcript(str(transcript))
+                    mock_popen.assert_not_called()
 
 
 # --- _mine_already_running ---
 
 
+def _seed_slot(pid_dir, cmd, body: str):
+    """Write ``body`` into the per-target slot for ``cmd`` under ``pid_dir``."""
+    from mempalace.hooks_cli import _pid_file_for_cmd
+
+    with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+        slot = _pid_file_for_cmd(cmd)
+    slot.parent.mkdir(parents=True, exist_ok=True)
+    slot.write_text(body)
+    return slot
+
+
 def test_mine_already_running_no_file(tmp_path):
-    """Returns False when no PID file exists."""
-    with patch("mempalace.hooks_cli._MINE_PID_FILE", tmp_path / "mine.pid"):
-        assert _mine_already_running() is False
+    """Returns False when no per-target slot exists."""
+    cmd = ["mempalace", "mine", "/tmp/x", "--mode", "projects"]
+    with patch("mempalace.hooks_cli._MINE_PID_DIR", tmp_path / "mine_pids"):
+        assert _mine_already_running(cmd) is False
 
 
 def test_mine_already_running_dead_pid(tmp_path):
-    """Returns False when PID file contains a PID that no longer exists."""
-    pid_file = tmp_path / "mine.pid"
-    pid_file.write_text("999999999")  # almost certainly not a real PID
-    with patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file):
-        assert _mine_already_running() is False
+    """Returns False when the slot's recorded PID is no longer alive."""
+    pid_dir = tmp_path / "mine_pids"
+    cmd = ["mempalace", "mine", "/tmp/x", "--mode", "projects"]
+    _seed_slot(pid_dir, cmd, "999999999")  # almost certainly not a real PID
+    with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+        assert _mine_already_running(cmd) is False
 
 
 def test_mine_already_running_live_pid(tmp_path):
-    """Returns True when PID file contains the current process's own PID."""
-    pid_file = tmp_path / "mine.pid"
-    pid_file.write_text(str(os.getpid()))  # current process is definitely alive
-    with patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file):
-        assert _mine_already_running() is True
+    """Returns True when the slot's recorded PID is alive."""
+    pid_dir = tmp_path / "mine_pids"
+    cmd = ["mempalace", "mine", "/tmp/x", "--mode", "projects"]
+    _seed_slot(pid_dir, cmd, str(os.getpid()))  # current process is alive
+    with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+        assert _mine_already_running(cmd) is True
 
 
 def test_mine_already_running_corrupt_file(tmp_path):
-    """Returns False when PID file contains non-integer content."""
-    pid_file = tmp_path / "mine.pid"
-    pid_file.write_text("not-a-pid")
-    with patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file):
-        assert _mine_already_running() is False
+    """Returns False when the slot contains non-integer content."""
+    pid_dir = tmp_path / "mine_pids"
+    cmd = ["mempalace", "mine", "/tmp/x", "--mode", "projects"]
+    _seed_slot(pid_dir, cmd, "not-a-pid")
+    with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+        assert _mine_already_running(cmd) is False
+
+
+def test_mine_already_running_distinct_cmds_independent(tmp_path):
+    """Slots are keyed per cmd; an alive entry for cmd A doesn't shadow cmd B."""
+    pid_dir = tmp_path / "mine_pids"
+    cmd_a = ["mempalace", "mine", "/tmp/a", "--mode", "projects"]
+    cmd_b = ["mempalace", "mine", "/tmp/b", "--mode", "projects"]
+    _seed_slot(pid_dir, cmd_a, str(os.getpid()))
+    with patch("mempalace.hooks_cli._MINE_PID_DIR", pid_dir):
+        assert _mine_already_running(cmd_a) is True
+        assert _mine_already_running(cmd_b) is False
 
 
 # --- _get_mine_targets ---
diff --git a/tests/test_miner.py b/tests/test_miner.py
index 10dd33d..f9c4722 100644
--- a/tests/test_miner.py
+++ b/tests/test_miner.py
@@ -777,7 +777,7 @@ def fake_process_file(*args, **kwargs):
 
 
 def test_mine_cleans_up_pid_file_on_interrupt(tmp_path):
-    """Our own PID entry in mine.pid is removed in the finally clause."""
+    """Our own per-target PID slot is removed in the finally clause."""
     import pytest
     from unittest.mock import patch
 
@@ -786,14 +786,16 @@ def test_mine_cleans_up_pid_file_on_interrupt(tmp_path):
     _make_minable_project(project_root, n_files=2)
     palace_path = project_root / "palace"
 
-    pid_file = tmp_path / "mine.pid"
+    pid_file = tmp_path / "mine_abc.pid"
     pid_file.write_text(str(os.getpid()))
 
     def fake_process_file(*args, **kwargs):
         raise KeyboardInterrupt
 
+    # The mine subprocess receives its slot path via env var; the cleanup
+    # hook in miner.py reads that var and removes the slot if it matches.
     with (
-        patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file),
+        patch.dict(os.environ, {"MEMPALACE_MINE_PID_FILE": str(pid_file)}),
         patch("mempalace.miner.process_file", side_effect=fake_process_file),
     ):
         with pytest.raises(SystemExit):
@@ -803,7 +805,7 @@ def fake_process_file(*args, **kwargs):
 
 
 def test_mine_cleans_up_pid_file_on_clean_exit(tmp_path):
-    """Successful mine also removes its own PID entry in the finally clause."""
+    """Successful mine also removes its own per-target PID slot."""
     from unittest.mock import patch
 
     project_root = tmp_path / "proj"
@@ -811,17 +813,17 @@ def test_mine_cleans_up_pid_file_on_clean_exit(tmp_path):
     _make_minable_project(project_root, n_files=1)
     palace_path = project_root / "palace"
 
-    pid_file = tmp_path / "mine.pid"
+    pid_file = tmp_path / "mine_abc.pid"
     pid_file.write_text(str(os.getpid()))
 
-    with patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file):
+    with patch.dict(os.environ, {"MEMPALACE_MINE_PID_FILE": str(pid_file)}):
         mine(str(project_root), str(palace_path))
 
     assert not pid_file.exists()
 
 
 def test_mine_does_not_remove_other_processes_pid_file(tmp_path):
-    """A PID file pointing at someone else's PID is left untouched."""
+    """A PID slot pointing at someone else's PID is left untouched."""
     from unittest.mock import patch
 
     project_root = tmp_path / "proj"
@@ -830,10 +832,10 @@ def test_mine_does_not_remove_other_processes_pid_file(tmp_path):
     palace_path = project_root / "palace"
 
     other_pid = os.getpid() + 999_999  # a PID that isn't us
-    pid_file = tmp_path / "mine.pid"
+    pid_file = tmp_path / "mine_abc.pid"
     pid_file.write_text(str(other_pid))
 
-    with patch("mempalace.hooks_cli._MINE_PID_FILE", pid_file):
+    with patch.dict(os.environ, {"MEMPALACE_MINE_PID_FILE": str(pid_file)}):
         mine(str(project_root), str(palace_path))
 
     assert pid_file.exists(), "Foreign PID entries must not be removed"

From d4c476b7d3fa4e232e94cf687b19c86d6c8b9336 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 8 May 2026 02:16:07 -0300
Subject: [PATCH 106/127] fix(hooks): use Optional[Path] for py39 compat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PEP 604 union syntax (Path | None) requires Python 3.10+. The project
still supports 3.9 (per pyproject target-version and CI matrix), and
this annotation lives in a function signature so it is evaluated at
module load time — failing with "unsupported operand type(s) for |"
on test-linux 3.9.

The other ``int | None`` annotation in this file is inside a function
body, where Python skips runtime evaluation of local annotations, so
it does not trip 3.9.
---
 mempalace/hooks_cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mempalace/hooks_cli.py b/mempalace/hooks_cli.py
index f7753a0..b8d07e0 100644
--- a/mempalace/hooks_cli.py
+++ b/mempalace/hooks_cli.py
@@ -14,6 +14,7 @@
 import sys
 from datetime import datetime
 from pathlib import Path
+from typing import Optional
 
 SAVE_INTERVAL = 15
 STATE_DIR = Path.home() / ".mempalace" / "hook_state"
@@ -343,7 +344,7 @@ def _mine_already_running(cmd: list[str]) -> bool:
     return _pid_alive(int(recorded))
 
 
-def _claim_mine_slot(cmd: list[str]) -> Path | None:
+def _claim_mine_slot(cmd: list[str]) -> Optional[Path]:
     """Atomically reserve the per-target PID slot for ``cmd``.
 
     Returns the slot path on success, or ``None`` if the target is

From eebf48e9752594e999fa7e8818420b712e1fbf6f Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Fri, 8 May 2026 06:29:36 +0000
Subject: [PATCH 107/127] fix(kg): accept ISO datetimes for temporal inputs

---
 mempalace/config.py      |  84 ++++++++++++++++++++++++-------
 mempalace/mcp_server.py  |  17 ++++---
 tests/test_config.py     |  58 +++++++++++++++++++++
 tests/test_mcp_server.py | 105 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 238 insertions(+), 26 deletions(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index fd32a17..19b0b61 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -7,6 +7,7 @@
 import json
 import os
 import re
+from datetime import date, datetime
 from functools import lru_cache
 from pathlib import Path
 
@@ -82,38 +83,85 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
     return value
 
 
-# ISO-8601 date validator for knowledge-graph temporal parameters
-# (as_of, valid_from, valid_to, ended). Parameterized queries already
-# prevent SQL injection, but unvalidated date strings silently miss
-# every row — callers cannot distinguish "no fact at this time" from
-# "your date format was unrecognized." Require full YYYY-MM-DD: KG
-# queries compare TEXT dates lexicographically, so partials like "2026"
-# would re-introduce silent empty results (e.g. "2026-01-01" <= "2026"
-# is False), defeating the purpose of validation.
+# ISO-8601 temporal validator for knowledge-graph temporal parameters
+# (as_of, valid_from, valid_to, ended).
+#
+# KG temporal values are stored as TEXT. We accept complete date and datetime
+# forms, but still reject partial dates like YYYY or YYYY-MM because those can
+# silently miss rows in lexicographic comparisons.
 _ISO_DATE_RE = re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$")
 
+_ISO_DATETIME_RE = re.compile(
+    r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])"
+    r"[T ](?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
+    r"(?:\.\d+)?(?:Z|[+-](?:[01]\d|2[0-3]):[0-5]\d)?$"
+)
+
+
+def _validate_iso_temporal_calendar(value: str) -> None:
+    """Reject impossible calendar values after regex shape validation."""
+
+    if _ISO_DATE_RE.match(value):
+        date.fromisoformat(value)
+        return
+
+    if _ISO_DATETIME_RE.match(value):
+        # datetime.fromisoformat accepts "+00:00"; normalize the common
+        # ISO-8601 "Z" UTC suffix for Python versions that require it.
+        datetime.fromisoformat(value.replace("Z", "+00:00"))
+        return
+
+    raise ValueError
 
-def sanitize_iso_date(value, field_name: str = "date"):
-    """Validate an ISO-8601 date string, accepting None or empty as-is.
 
-    Accepts only ``YYYY-MM-DD``. Raises ValueError on any other
-    non-empty input so the MCP layer can surface a clear error to the
-    caller instead of silently returning empty results. Partial dates
-    (``YYYY``, ``YYYY-MM``) are rejected because KG queries compare
-    TEXT dates lexicographically and would silently exclude valid facts.
+def sanitize_iso_temporal(value, field_name: str = "date"):
+    """Validate an ISO-8601 date or datetime string.
+
+    Accepts ``None`` and ``""`` as pass-through values.
+
+    Accepted non-empty string forms:
+
+    - ``YYYY-MM-DD``
+    - ``YYYY-MM-DDTHH:MM:SS``
+    - ``YYYY-MM-DDTHH:MM:SS.fff``
+    - ``YYYY-MM-DDTHH:MM:SSZ``
+    - ``YYYY-MM-DDTHH:MM:SS±HH:MM``
+    - ``YYYY-MM-DD HH:MM:SS``
+
+    Partial dates such as ``YYYY`` and ``YYYY-MM`` are rejected because KG
+    queries compare TEXT temporal values lexicographically and partials can
+    silently exclude valid facts.
     """
+
     if value is None or value == "":
         return value
     if not isinstance(value, str):
         raise ValueError(f"{field_name} must be a string")
+
     value = value.strip()
-    if not _ISO_DATE_RE.match(value):
+
+    try:
+        _validate_iso_temporal_calendar(value)
+    except ValueError:
         raise ValueError(
-            f"{field_name}={value!r} is not a valid ISO-8601 date " f"(expected YYYY-MM-DD)"
-        )
+            f"{field_name}={value!r} is not a valid ISO-8601 date or datetime "
+            "(expected YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)"
+        ) from None
+
     return value
 
 
+def sanitize_iso_date(value, field_name: str = "date"):
+    """Backward-compatible wrapper for ISO temporal validation.
+
+    Historically this accepted only full dates. It now accepts full ISO
+    datetimes too, but the old name is kept so existing imports continue to
+    work.
+    """
+
+    return sanitize_iso_temporal(value, field_name)
+
+
 def sanitize_content(value: str, max_length: int = 100_000) -> str:
     """Validate drawer/diary content length."""
     if not isinstance(value, str) or not value.strip():
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 521cb07..5be2136 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -57,7 +57,7 @@
     sanitize_kg_value,
     sanitize_name,
     sanitize_content,
-    sanitize_iso_date,
+    sanitize_iso_temporal,
 )
 from .version import __version__  # noqa: E402
 from chromadb.errors import NotFoundError as _ChromaNotFoundError  # noqa: E402
@@ -1156,7 +1156,7 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
     """Query the knowledge graph for an entity's relationships."""
     try:
         entity = sanitize_kg_value(entity, "entity")
-        as_of = sanitize_iso_date(as_of, "as_of")
+        as_of = sanitize_iso_temporal(as_of, "as_of")
     except ValueError as e:
         return {"error": str(e)}
     if direction not in ("outgoing", "incoming", "both"):
@@ -1190,7 +1190,8 @@ def tool_kg_add(
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
         object = sanitize_kg_value(object, "object")
-        valid_from = sanitize_iso_date(valid_from, "valid_from")
+        valid_from = sanitize_iso_temporal(valid_from, "valid_from")
+        valid_to = sanitize_iso_temporal(valid_to, "valid_to")
     except ValueError as e:
         return {"success": False, "error": str(e)}
 
@@ -1236,7 +1237,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
         subject = sanitize_kg_value(subject, "subject")
         predicate = sanitize_name(predicate, "predicate")
         object = sanitize_kg_value(object, "object")
-        ended = sanitize_iso_date(ended, "ended")
+        ended = sanitize_iso_temporal(ended, "ended")
     except ValueError as e:
         return {"success": False, "error": str(e)}
     resolved_ended = ended or date.today().isoformat()
@@ -1633,7 +1634,7 @@ def tool_reconnect():
                 },
                 "as_of": {
                     "type": "string",
-                    "description": "Date filter — only facts valid at this date (YYYY-MM-DD, optional)",
+                    "description": "Date/datetime filter — only facts valid at this time (YYYY-MM-DD or ISO datetime, optional)",
                 },
                 "direction": {
                     "type": "string",
@@ -1657,11 +1658,11 @@ def tool_reconnect():
                 "object": {"type": "string", "description": "The entity being connected to"},
                 "valid_from": {
                     "type": "string",
-                    "description": "When this became true (YYYY-MM-DD, optional)",
+                    "description": "When this became true (YYYY-MM-DD or ISO datetime, optional)",
                 },
                 "valid_to": {
                     "type": "string",
-                    "description": "When this stopped being true (YYYY-MM-DD, optional). Use for backfilling already-ended historical facts.",
+                    "description": "When this stopped being true (YYYY-MM-DD or ISO datetime, optional). Use for backfilling already-ended historical facts.",
                 },
                 "source_closet": {
                     "type": "string",
@@ -1690,7 +1691,7 @@ def tool_reconnect():
                 "object": {"type": "string", "description": "Connected entity"},
                 "ended": {
                     "type": "string",
-                    "description": "When it stopped being true (YYYY-MM-DD, default: today)",
+                    "description": "When it stopped being true (YYYY-MM-DD or ISO datetime, default: today)",
                 },
             },
             "required": ["subject", "predicate", "object"],
diff --git a/tests/test_config.py b/tests/test_config.py
index 204faae..d26a567 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -7,6 +7,7 @@
     MempalaceConfig,
     normalize_wing_name,
     sanitize_iso_date,
+    sanitize_iso_temporal,
     sanitize_kg_value,
     sanitize_name,
 )
@@ -284,3 +285,60 @@ def test_iso_date_rejects_non_string():
 def test_iso_date_error_names_field():
     with pytest.raises(ValueError, match="valid_from"):
         sanitize_iso_date("yesterday", "valid_from")
+
+
+def test_iso_temporal_accepts_full_datetime():
+    assert sanitize_iso_temporal("2026-05-06T14:23:00") == "2026-05-06T14:23:00"
+
+
+def test_iso_temporal_accepts_fractional_seconds():
+    assert sanitize_iso_temporal("2026-05-06T14:23:00.123") == "2026-05-06T14:23:00.123"
+
+
+def test_iso_temporal_accepts_utc_z_suffix():
+    assert sanitize_iso_temporal("2026-05-06T14:23:00Z") == "2026-05-06T14:23:00Z"
+
+
+def test_iso_temporal_accepts_timezone_offset():
+    assert sanitize_iso_temporal("2026-05-06T14:23:00+02:00") == "2026-05-06T14:23:00+02:00"
+
+
+def test_iso_temporal_accepts_negative_timezone_offset():
+    assert sanitize_iso_temporal("2026-05-06T14:23:00-05:30") == "2026-05-06T14:23:00-05:30"
+
+
+def test_iso_temporal_accepts_sqlite_space_separator():
+    assert sanitize_iso_temporal("2026-05-06 14:23:00") == "2026-05-06 14:23:00"
+
+
+def test_iso_temporal_strips_datetime_whitespace():
+    assert sanitize_iso_temporal(" 2026-05-06T14:23:00Z ") == "2026-05-06T14:23:00Z"
+
+
+def test_iso_date_backward_compatible_wrapper_accepts_datetime():
+    assert sanitize_iso_date("2026-05-06T14:23:00Z") == "2026-05-06T14:23:00Z"
+
+
+def test_iso_temporal_rejects_datetime_without_seconds():
+    with pytest.raises(ValueError):
+        sanitize_iso_temporal("2026-05-06T14:23")
+
+
+def test_iso_temporal_rejects_invalid_datetime_hour():
+    with pytest.raises(ValueError):
+        sanitize_iso_temporal("2026-05-06T24:00:00")
+
+
+def test_iso_temporal_rejects_invalid_timezone_offset():
+    with pytest.raises(ValueError):
+        sanitize_iso_temporal("2026-05-06T14:23:00+24:00")
+
+
+def test_iso_temporal_rejects_invalid_calendar_date():
+    with pytest.raises(ValueError):
+        sanitize_iso_temporal("2026-02-31")
+
+
+def test_iso_temporal_error_names_field():
+    with pytest.raises(ValueError, match="as_of"):
+        sanitize_iso_temporal("2026-05-06T14:23", "as_of")
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 1f47192..76b90ac 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -923,6 +923,111 @@ def test_kg_query_rejects_partial_iso_dates(self, monkeypatch, config, palace_pa
         result = tool_kg_query(entity="Max", as_of="2026-03-15")
         assert "error" not in result, f"rejected valid date: {result}"
 
+    def test_kg_add_accepts_datetime_valid_from(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        from mempalace import mcp_server
+
+        result = mcp_server.tool_kg_add(
+            "Alice",
+            "works_at",
+            "Acme",
+            valid_from="2026-05-06T14:23:00Z",
+        )
+
+        assert result["success"] is True
+
+        facts = kg.query_entity("Alice", direction="outgoing")
+        fact = next(r for r in facts if r["predicate"] == "works_at" and r["object"] == "Acme")
+
+        assert fact["valid_from"] == "2026-05-06T14:23:00Z"
+
+    def test_kg_add_accepts_datetime_valid_to(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        from mempalace import mcp_server
+
+        result = mcp_server.tool_kg_add(
+            "Alice",
+            "worked_at",
+            "OldCo",
+            valid_from="2026-05-06T14:00:00Z",
+            valid_to="2026-05-06T15:00:00Z",
+        )
+
+        assert result["success"] is True
+
+        facts = kg.query_entity("Alice", direction="outgoing")
+        fact = next(r for r in facts if r["predicate"] == "worked_at" and r["object"] == "OldCo")
+
+        assert fact["valid_from"] == "2026-05-06T14:00:00Z"
+        assert fact["valid_to"] == "2026-05-06T15:00:00Z"
+
+    def test_kg_query_accepts_datetime_as_of(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        kg.add_triple(
+            "Alice",
+            "works_at",
+            "Acme",
+            valid_from="2026-05-06T14:00:00Z",
+        )
+
+        from mempalace import mcp_server
+
+        result = mcp_server.tool_kg_query(
+            "Alice",
+            as_of="2026-05-06T14:23:00Z",
+            direction="outgoing",
+        )
+
+        assert "error" not in result
+        assert result["as_of"] == "2026-05-06T14:23:00Z"
+        assert result["count"] == 1
+        assert result["facts"][0]["object"] == "Acme"
+
+    def test_kg_invalidate_accepts_datetime_ended(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        kg.add_triple(
+            "Alice",
+            "works_at",
+            "Acme",
+            valid_from="2026-05-06T14:00:00Z",
+        )
+
+        from mempalace import mcp_server
+
+        result = mcp_server.tool_kg_invalidate(
+            "Alice",
+            "works_at",
+            "Acme",
+            ended="2026-05-06T14:23:00Z",
+        )
+
+        assert result["success"] is True
+        assert result["ended"] == "2026-05-06T14:23:00Z"
+
+        facts = kg.query_entity("Alice", direction="outgoing")
+        fact = next(r for r in facts if r["predicate"] == "works_at" and r["object"] == "Acme")
+
+        assert fact["valid_to"] == "2026-05-06T14:23:00Z"
+
+    def test_kg_query_rejects_partial_datetime(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        from mempalace import mcp_server
+
+        result = mcp_server.tool_kg_query(
+            "Alice",
+            as_of="2026-05-06T14:23",
+            direction="outgoing",
+        )
+
+        assert "error" in result
+        assert "as_of" in result["error"]
+        assert "ISO-8601 date or datetime" in result["error"]
+
 
 # ── Diary Tools ─────────────────────────────────────────────────────────
 

From 4adc99f75588481bbc99728a8c54af367d1b3cb2 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Fri, 8 May 2026 08:32:25 +0000
Subject: [PATCH 108/127] fix(kg): canonicalize temporal datetime support

---
 mempalace/config.py           |  44 ++++++------
 mempalace/knowledge_graph.py  | 129 ++++++++++++++++++++++++++--------
 mempalace/mcp_server.py       |  36 +++++-----
 tests/test_config.py          |  45 ++++++------
 tests/test_knowledge_graph.py | 128 +++++++++++++++++++++++++++++++++
 tests/test_mcp_server.py      |  98 ++++++++++++++++++++++++--
 6 files changed, 384 insertions(+), 96 deletions(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index 19b0b61..608a938 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -86,15 +86,21 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
 # ISO-8601 temporal validator for knowledge-graph temporal parameters
 # (as_of, valid_from, valid_to, ended).
 #
-# KG temporal values are stored as TEXT. We accept complete date and datetime
-# forms, but still reject partial dates like YYYY or YYYY-MM because those can
-# silently miss rows in lexicographic comparisons.
+# The KG stores temporal values as TEXT. Lexicographic comparisons are only
+# safe when all datetime values use one canonical shape. Accept full dates
+# for legacy compatibility and exact UTC datetimes for sub-day precision.
+#
+# Accepted:
+#   YYYY-MM-DD
+#   YYYY-MM-DDTHH:MM:SSZ
+#
+# Rejected:
+#   partial dates, naive datetimes, timezone offsets, fractional seconds,
+#   and SQLite-style space-separated datetimes.
 _ISO_DATE_RE = re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$")
 
-_ISO_DATETIME_RE = re.compile(
-    r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])"
-    r"[T ](?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d"
-    r"(?:\.\d+)?(?:Z|[+-](?:[01]\d|2[0-3]):[0-5]\d)?$"
+_ISO_UTC_DATETIME_RE = re.compile(
+    r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])" r"T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\dZ$"
 )
 
 
@@ -105,9 +111,7 @@ def _validate_iso_temporal_calendar(value: str) -> None:
         date.fromisoformat(value)
         return
 
-    if _ISO_DATETIME_RE.match(value):
-        # datetime.fromisoformat accepts "+00:00"; normalize the common
-        # ISO-8601 "Z" UTC suffix for Python versions that require it.
+    if _ISO_UTC_DATETIME_RE.match(value):
         datetime.fromisoformat(value.replace("Z", "+00:00"))
         return
 
@@ -115,22 +119,18 @@ def _validate_iso_temporal_calendar(value: str) -> None:
 
 
 def sanitize_iso_temporal(value, field_name: str = "date"):
-    """Validate an ISO-8601 date or datetime string.
+    """Validate an ISO-8601 date or canonical UTC datetime string.
 
     Accepts ``None`` and ``""`` as pass-through values.
 
     Accepted non-empty string forms:
 
     - ``YYYY-MM-DD``
-    - ``YYYY-MM-DDTHH:MM:SS``
-    - ``YYYY-MM-DDTHH:MM:SS.fff``
     - ``YYYY-MM-DDTHH:MM:SSZ``
-    - ``YYYY-MM-DDTHH:MM:SS±HH:MM``
-    - ``YYYY-MM-DD HH:MM:SS``
 
-    Partial dates such as ``YYYY`` and ``YYYY-MM`` are rejected because KG
-    queries compare TEXT temporal values lexicographically and partials can
-    silently exclude valid facts.
+    Partial dates are rejected because KG queries compare TEXT temporal values.
+    Non-canonical datetime forms are rejected because mixed temporal string
+    formats can silently return wrong KG query results.
     """
 
     if value is None or value == "":
@@ -144,8 +144,8 @@ def sanitize_iso_temporal(value, field_name: str = "date"):
         _validate_iso_temporal_calendar(value)
     except ValueError:
         raise ValueError(
-            f"{field_name}={value!r} is not a valid ISO-8601 date or datetime "
-            "(expected YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)"
+            f"{field_name}={value!r} is not a valid ISO-8601 date or UTC datetime "
+            "(expected YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ)"
         ) from None
 
     return value
@@ -154,8 +154,8 @@ def sanitize_iso_temporal(value, field_name: str = "date"):
 def sanitize_iso_date(value, field_name: str = "date"):
     """Backward-compatible wrapper for ISO temporal validation.
 
-    Historically this accepted only full dates. It now accepts full ISO
-    datetimes too, but the old name is kept so existing imports continue to
+    Historically this accepted only full dates. It now also accepts canonical
+    UTC datetimes, but the old name is kept so existing imports continue to
     work.
     """
 
diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py
index 30055a1..3548502 100644
--- a/mempalace/knowledge_graph.py
+++ b/mempalace/knowledge_graph.py
@@ -42,11 +42,58 @@
 import threading
 from datetime import date, datetime
 from pathlib import Path
+from .config import sanitize_iso_temporal
 
 
 DEFAULT_KG_PATH = os.path.expanduser("~/.mempalace/knowledge_graph.sqlite3")
 
 
+def _is_date_only_temporal(value: str) -> bool:
+    return isinstance(value, str) and len(value) == 10 and value[4] == "-" and value[7] == "-"
+
+
+def _temporal_start_key(value: str | None) -> str | None:
+    """Return the comparable instant for a valid_from/as_of value."""
+
+    if value is None:
+        return None
+
+    if _is_date_only_temporal(value):
+        return f"{value}T00:00:00Z"
+
+    return value
+
+
+def _temporal_end_key(value: str | None) -> str | None:
+    """Return the comparable instant for a valid_to value.
+
+    Date-only valid_to values represent the whole day for backward
+    compatibility with existing KG facts.
+    """
+
+    if value is None:
+        return None
+
+    if _is_date_only_temporal(value):
+        return f"{value}T23:59:59Z"
+
+    return value
+
+
+def _triple_valid_at(valid_from: str | None, valid_to: str | None, as_of: str) -> bool:
+    as_of_key = _temporal_start_key(as_of)
+    valid_from_key = _temporal_start_key(valid_from)
+    valid_to_key = _temporal_end_key(valid_to)
+
+    if valid_from_key is not None and valid_from_key > as_of_key:
+        return False
+
+    if valid_to_key is not None and valid_to_key < as_of_key:
+        return False
+
+    return True
+
+
 class KnowledgeGraph:
     def __init__(self, db_path: str = None):
         self.db_path = db_path or DEFAULT_KG_PATH
@@ -171,10 +218,17 @@ def add_triple(
             add_triple("Max", "does", "swimming", valid_from="2025-01-01")
             add_triple("Alice", "worried_about", "Max injury", valid_from="2026-01", valid_to="2026-02")
         """
-        # Reject inverted intervals: a triple with valid_to < valid_from
-        # would never satisfy `valid_from <= as_of AND valid_to >= as_of`,
-        # so it would be invisible to every query — silently corrupt.
-        if valid_from is not None and valid_to is not None and valid_to < valid_from:
+        valid_from = sanitize_iso_temporal(valid_from, "valid_from")
+        valid_to = sanitize_iso_temporal(valid_to, "valid_to")
+
+        # Reject inverted intervals. Use temporal comparison keys rather than raw
+        # string comparison so legacy date-only values and canonical UTC datetimes can
+        # safely coexist.
+        if (
+            valid_from is not None
+            and valid_to is not None
+            and _temporal_end_key(valid_to) < _temporal_start_key(valid_from)
+        ):
             raise ValueError(
                 f"valid_to={valid_to!r} is before valid_from={valid_from!r}; "
                 "an inverted interval would be invisible to every KG query"
@@ -230,17 +284,34 @@ def add_triple(
         return triple_id
 
     def invalidate(self, subject: str, predicate: str, obj: str, ended: str = None):
-        """Mark a relationship as no longer valid (set valid_to date)."""
+        """Mark a relationship as no longer valid (set valid_to date/time)."""
         sub_id = self._entity_id(subject)
         obj_id = self._entity_id(obj)
         pred = predicate.lower().replace(" ", "_")
-        ended = ended or date.today().isoformat()
+        ended = sanitize_iso_temporal(ended or date.today().isoformat(), "ended")
 
         with self._lock:
             conn = self._conn()
             with conn:
+                rows = conn.execute(
+                    "SELECT id, valid_from FROM triples "
+                    "WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
+                    (sub_id, pred, obj_id),
+                ).fetchall()
+
+                for row in rows:
+                    valid_from = row["valid_from"]
+                    if valid_from is not None and _temporal_end_key(ended) < _temporal_start_key(
+                        valid_from
+                    ):
+                        raise ValueError(
+                            f"valid_to={ended!r} is before valid_from={valid_from!r}; "
+                            "an inverted interval would be invisible to every KG query"
+                        )
+
                 conn.execute(
-                    "UPDATE triples SET valid_to=? WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
+                    "UPDATE triples SET valid_to=? "
+                    "WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
                     (ended, sub_id, pred, obj_id),
                 )
 
@@ -251,21 +322,23 @@ def query_entity(self, name: str, as_of: str = None, direction: str = "outgoing"
         Get all relationships for an entity.
 
         direction: "outgoing" (entity → ?), "incoming" (? → entity), "both"
-        as_of: date string — only return facts valid at that time
+        as_of: ISO date or canonical UTC datetime — only return facts valid then
         """
+        as_of = sanitize_iso_temporal(as_of, "as_of")
         eid = self._entity_id(name)
-
         results = []
         with self._lock:
             conn = self._conn()
 
             if direction in ("outgoing", "both"):
-                query = "SELECT t.*, e.name as obj_name FROM triples t JOIN entities e ON t.object = e.id WHERE t.subject = ?"
-                params = [eid]
-                if as_of:
-                    query += " AND (t.valid_from IS NULL OR t.valid_from <= ?) AND (t.valid_to IS NULL OR t.valid_to >= ?)"
-                    params.extend([as_of, as_of])
-                for row in conn.execute(query, params).fetchall():
+                query = (
+                    "SELECT t.*, e.name as obj_name FROM triples t "
+                    "JOIN entities e ON t.object = e.id WHERE t.subject = ?"
+                )
+                for row in conn.execute(query, [eid]).fetchall():
+                    if as_of and not _triple_valid_at(row["valid_from"], row["valid_to"], as_of):
+                        continue
+
                     results.append(
                         {
                             "direction": "outgoing",
@@ -281,12 +354,14 @@ def query_entity(self, name: str, as_of: str = None, direction: str = "outgoing"
                     )
 
             if direction in ("incoming", "both"):
-                query = "SELECT t.*, e.name as sub_name FROM triples t JOIN entities e ON t.subject = e.id WHERE t.object = ?"
-                params = [eid]
-                if as_of:
-                    query += " AND (t.valid_from IS NULL OR t.valid_from <= ?) AND (t.valid_to IS NULL OR t.valid_to >= ?)"
-                    params.extend([as_of, as_of])
-                for row in conn.execute(query, params).fetchall():
+                query = (
+                    "SELECT t.*, e.name as sub_name FROM triples t "
+                    "JOIN entities e ON t.subject = e.id WHERE t.object = ?"
+                )
+                for row in conn.execute(query, [eid]).fetchall():
+                    if as_of and not _triple_valid_at(row["valid_from"], row["valid_to"], as_of):
+                        continue
+
                     results.append(
                         {
                             "direction": "incoming",
@@ -300,11 +375,11 @@ def query_entity(self, name: str, as_of: str = None, direction: str = "outgoing"
                             "current": row["valid_to"] is None,
                         }
                     )
-
         return results
 
     def query_relationship(self, predicate: str, as_of: str = None):
         """Get all triples with a given relationship type."""
+        as_of = sanitize_iso_temporal(as_of, "as_of")
         pred = predicate.lower().replace(" ", "_")
         query = """
             SELECT t.*, s.name as sub_name, o.name as obj_name
@@ -313,15 +388,13 @@ def query_relationship(self, predicate: str, as_of: str = None):
             JOIN entities o ON t.object = o.id
             WHERE t.predicate = ?
         """
-        params = [pred]
-        if as_of:
-            query += " AND (t.valid_from IS NULL OR t.valid_from <= ?) AND (t.valid_to IS NULL OR t.valid_to >= ?)"
-            params.extend([as_of, as_of])
-
         results = []
         with self._lock:
             conn = self._conn()
-            for row in conn.execute(query, params).fetchall():
+            for row in conn.execute(query, [pred]).fetchall():
+                if as_of and not _triple_valid_at(row["valid_from"], row["valid_to"], as_of):
+                    continue
+
                 results.append(
                     {
                         "subject": row["sub_name"],
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 5be2136..a31385f 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -1159,8 +1159,10 @@ def tool_kg_query(entity: str, as_of: str = None, direction: str = "both"):
         as_of = sanitize_iso_temporal(as_of, "as_of")
     except ValueError as e:
         return {"error": str(e)}
+
     if direction not in ("outgoing", "incoming", "both"):
         return {"error": "direction must be 'outgoing', 'incoming', or 'both'"}
+
     results = _call_kg(lambda kg: kg.query_entity(entity, as_of=as_of, direction=direction))
     return {"entity": entity, "as_of": as_of, "facts": results, "count": len(results)}
 
@@ -1178,13 +1180,11 @@ def tool_kg_add(
     """Add a relationship to the knowledge graph.
 
     All temporal and provenance fields are optional. ``valid_to`` lets callers
-    backfill historical facts with a known end date in a single call (instead
-    of a separate ``kg_invalidate``). ``source_file`` and ``source_drawer_id``
-    are RFC 002 provenance fields populated by adapters / bulk importers.
+    backfill historical facts with a known end date/time in a single call
+    instead of a separate ``kg_invalidate`` call.
 
-    TODO(#1283): once the ISO-8601 validation PR lands, wire ``validate_iso_date``
-    over ``valid_from`` / ``valid_to`` here so malformed dates fail fast at the
-    MCP boundary instead of silently producing empty query results.
+    Temporal values accept either ``YYYY-MM-DD`` or canonical UTC datetimes in
+    the form ``YYYY-MM-DDTHH:MM:SSZ``.
     """
     try:
         subject = sanitize_kg_value(subject, "subject")
@@ -1208,6 +1208,7 @@ def tool_kg_add(
             "source_drawer_id": source_drawer_id,
         },
     )
+
     triple_id = _call_kg(
         lambda kg: kg.add_triple(
             subject,
@@ -1224,14 +1225,14 @@ def tool_kg_add(
 
 
 def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = None):
-    """Mark a fact as no longer true (set end date).
+    """Mark a fact as no longer true.
 
-    Returns the actual ``ended`` date that was stored — when the caller omits
-    ``ended``, the underlying graph stamps ``date.today()``, and the response
-    reflects that resolved value (instead of the literal string ``"today"``)
-    so callers can verify what was persisted.
+    Returns the actual ``ended`` date/time that was stored. When the caller
+    omits ``ended``, the underlying graph stamps ``date.today()`` and the
+    response reflects that resolved value.
 
-    TODO(#1283): apply ``validate_iso_date`` to ``ended`` once that PR lands.
+    Temporal values accept either ``YYYY-MM-DD`` or canonical UTC datetimes in
+    the form ``YYYY-MM-DDTHH:MM:SSZ``.
     """
     try:
         subject = sanitize_kg_value(subject, "subject")
@@ -1240,7 +1241,9 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
         ended = sanitize_iso_temporal(ended, "ended")
     except ValueError as e:
         return {"success": False, "error": str(e)}
+
     resolved_ended = ended or date.today().isoformat()
+
     _wal_log(
         "kg_invalidate",
         {
@@ -1250,6 +1253,7 @@ def tool_kg_invalidate(subject: str, predicate: str, object: str, ended: str = N
             "ended": resolved_ended,
         },
     )
+
     _call_kg(lambda kg: kg.invalidate(subject, predicate, object, ended=resolved_ended))
     return {
         "success": True,
@@ -1634,7 +1638,7 @@ def tool_reconnect():
                 },
                 "as_of": {
                     "type": "string",
-                    "description": "Date/datetime filter — only facts valid at this time (YYYY-MM-DD or ISO datetime, optional)",
+                    "description": "Date/datetime filter — only facts valid at this time (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ, optional)",
                 },
                 "direction": {
                     "type": "string",
@@ -1658,11 +1662,11 @@ def tool_reconnect():
                 "object": {"type": "string", "description": "The entity being connected to"},
                 "valid_from": {
                     "type": "string",
-                    "description": "When this became true (YYYY-MM-DD or ISO datetime, optional)",
+                    "description": "When this became true (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ, optional)",
                 },
                 "valid_to": {
                     "type": "string",
-                    "description": "When this stopped being true (YYYY-MM-DD or ISO datetime, optional). Use for backfilling already-ended historical facts.",
+                    "description": "When this stopped being true (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ, optional). Use for backfilling already-ended historical facts.",
                 },
                 "source_closet": {
                     "type": "string",
@@ -1691,7 +1695,7 @@ def tool_reconnect():
                 "object": {"type": "string", "description": "Connected entity"},
                 "ended": {
                     "type": "string",
-                    "description": "When it stopped being true (YYYY-MM-DD or ISO datetime, default: today)",
+                    "description": "When it stopped being true (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ, default: today)",
                 },
             },
             "required": ["subject", "predicate", "object"],
diff --git a/tests/test_config.py b/tests/test_config.py
index d26a567..93dacf3 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -287,30 +287,14 @@ def test_iso_date_error_names_field():
         sanitize_iso_date("yesterday", "valid_from")
 
 
-def test_iso_temporal_accepts_full_datetime():
-    assert sanitize_iso_temporal("2026-05-06T14:23:00") == "2026-05-06T14:23:00"
+def test_iso_temporal_accepts_full_date():
+    assert sanitize_iso_temporal("2026-05-06") == "2026-05-06"
 
 
-def test_iso_temporal_accepts_fractional_seconds():
-    assert sanitize_iso_temporal("2026-05-06T14:23:00.123") == "2026-05-06T14:23:00.123"
-
-
-def test_iso_temporal_accepts_utc_z_suffix():
+def test_iso_temporal_accepts_canonical_utc_datetime():
     assert sanitize_iso_temporal("2026-05-06T14:23:00Z") == "2026-05-06T14:23:00Z"
 
 
-def test_iso_temporal_accepts_timezone_offset():
-    assert sanitize_iso_temporal("2026-05-06T14:23:00+02:00") == "2026-05-06T14:23:00+02:00"
-
-
-def test_iso_temporal_accepts_negative_timezone_offset():
-    assert sanitize_iso_temporal("2026-05-06T14:23:00-05:30") == "2026-05-06T14:23:00-05:30"
-
-
-def test_iso_temporal_accepts_sqlite_space_separator():
-    assert sanitize_iso_temporal("2026-05-06 14:23:00") == "2026-05-06 14:23:00"
-
-
 def test_iso_temporal_strips_datetime_whitespace():
     assert sanitize_iso_temporal(" 2026-05-06T14:23:00Z ") == "2026-05-06T14:23:00Z"
 
@@ -324,14 +308,29 @@ def test_iso_temporal_rejects_datetime_without_seconds():
         sanitize_iso_temporal("2026-05-06T14:23")
 
 
-def test_iso_temporal_rejects_invalid_datetime_hour():
+def test_iso_temporal_rejects_naive_datetime():
     with pytest.raises(ValueError):
-        sanitize_iso_temporal("2026-05-06T24:00:00")
+        sanitize_iso_temporal("2026-05-06T14:23:00")
 
 
-def test_iso_temporal_rejects_invalid_timezone_offset():
+def test_iso_temporal_rejects_fractional_seconds():
+    with pytest.raises(ValueError):
+        sanitize_iso_temporal("2026-05-06T14:23:00.123Z")
+
+
+def test_iso_temporal_rejects_timezone_offset():
+    with pytest.raises(ValueError):
+        sanitize_iso_temporal("2026-05-06T14:23:00+02:00")
+
+
+def test_iso_temporal_rejects_space_separator():
+    with pytest.raises(ValueError):
+        sanitize_iso_temporal("2026-05-06 14:23:00")
+
+
+def test_iso_temporal_rejects_invalid_datetime_hour():
     with pytest.raises(ValueError):
-        sanitize_iso_temporal("2026-05-06T14:23:00+24:00")
+        sanitize_iso_temporal("2026-05-06T24:00:00Z")
 
 
 def test_iso_temporal_rejects_invalid_calendar_date():
diff --git a/tests/test_knowledge_graph.py b/tests/test_knowledge_graph.py
index 6eeb8d3..7f02253 100644
--- a/tests/test_knowledge_graph.py
+++ b/tests/test_knowledge_graph.py
@@ -171,3 +171,131 @@ def test_stats_seeded(self, seeded_kg):
         assert stats["triples"] == 5
         assert stats["current_facts"] == 4  # 1 expired (Acme Corp)
         assert stats["expired_facts"] == 1
+
+
+class TestTemporalDateTimeCompatibility:
+    def test_datetime_query_matches_legacy_date_only_fact(self, kg):
+        kg.add_triple(
+            "Alice",
+            "ate_at",
+            "Cafe",
+            valid_from="2026-05-06",
+            valid_to="2026-05-06",
+        )
+
+        result = kg.query_entity("Alice", as_of="2026-05-06T15:00:00Z")
+
+        assert len(result) == 1
+        assert result[0]["object"] == "Cafe"
+
+    def test_datetime_query_before_legacy_date_only_fact_does_not_match(self, kg):
+        kg.add_triple(
+            "Alice",
+            "ate_at",
+            "Cafe",
+            valid_from="2026-05-06",
+            valid_to="2026-05-06",
+        )
+
+        result = kg.query_entity("Alice", as_of="2026-05-05T23:59:59Z")
+
+        assert result == []
+
+    def test_datetime_query_after_legacy_date_only_fact_does_not_match(self, kg):
+        kg.add_triple(
+            "Alice",
+            "ate_at",
+            "Cafe",
+            valid_from="2026-05-06",
+            valid_to="2026-05-06",
+        )
+
+        result = kg.query_entity("Alice", as_of="2026-05-07T00:00:00Z")
+
+        assert result == []
+
+    def test_rejects_timezone_offset_datetime_at_kg_layer(self, kg):
+        with pytest.raises(ValueError):
+            kg.add_triple(
+                "Bob",
+                "works_at",
+                "Globex",
+                valid_from="2026-05-06T20:30:00-05:00",
+            )
+
+    def test_rejects_naive_datetime_at_kg_layer(self, kg):
+        with pytest.raises(ValueError):
+            kg.add_triple(
+                "Carol",
+                "is_in",
+                "NYC",
+                valid_from="2026-05-07T01:23:00",
+            )
+
+    def test_rejects_space_separated_datetime_at_kg_layer(self, kg):
+        with pytest.raises(ValueError):
+            kg.add_triple(
+                "Eve",
+                "is_in",
+                "London",
+                valid_from="2026-05-06T15:00:00Z",
+                valid_to="2026-05-06 20:00:00",
+            )
+
+    def test_date_only_valid_to_is_end_of_day_for_interval_check(self, kg):
+        kg.add_triple(
+            "Eve",
+            "is_in",
+            "London",
+            valid_from="2026-05-06T15:00:00Z",
+            valid_to="2026-05-06",
+        )
+
+        result = kg.query_entity("Eve", as_of="2026-05-06T20:00:00Z")
+
+        assert len(result) == 1
+        assert result[0]["object"] == "London"
+
+    def test_rejects_interval_when_date_only_end_is_before_datetime_start(self, kg):
+        with pytest.raises(
+            ValueError,
+            match=r"valid_to='2026-05-06'.*valid_from='2026-05-07T01:00:00Z'",
+        ):
+            kg.add_triple(
+                "Eve",
+                "is_in",
+                "London",
+                valid_from="2026-05-07T01:00:00Z",
+                valid_to="2026-05-06",
+            )
+
+    def test_query_relationship_uses_safe_temporal_comparison(self, kg):
+        kg.add_triple(
+            "Alice",
+            "visited",
+            "Cafe",
+            valid_from="2026-05-06",
+            valid_to="2026-05-06",
+        )
+
+        result = kg.query_relationship("visited", as_of="2026-05-06T15:00:00Z")
+
+        assert len(result) == 1
+        assert result[0]["subject"] == "Alice"
+        assert result[0]["object"] == "Cafe"
+
+    def test_invalidate_rejects_timezone_offset_ended(self, kg):
+        kg.add_triple(
+            "Alice",
+            "works_at",
+            "Acme",
+            valid_from="2026-05-06T14:00:00Z",
+        )
+
+        with pytest.raises(ValueError):
+            kg.invalidate(
+                "Alice",
+                "works_at",
+                "Acme",
+                ended="2026-05-06T20:30:00-05:00",
+            )
diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 76b90ac..575b42a 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -1013,20 +1013,104 @@ def test_kg_invalidate_accepts_datetime_ended(self, monkeypatch, config, palace_
 
         assert fact["valid_to"] == "2026-05-06T14:23:00Z"
 
-    def test_kg_query_rejects_partial_datetime(self, monkeypatch, config, palace_path, kg):
+    def test_kg_add_rejects_non_canonical_datetimes(self, monkeypatch, config, palace_path, kg):
         _patch_mcp_server(monkeypatch, config, kg)
 
         from mempalace import mcp_server
 
-        result = mcp_server.tool_kg_query(
+        invalid_values = [
+            "2026-05-06T14:23:00+02:00",
+            "2026-05-06T14:23:00-05:30",
+            "2026-05-06T14:23:00.123Z",
+            "2026-05-06 14:23:00",
+            "2026-05-06T14:23:00",
+        ]
+
+        for value in invalid_values:
+            result = mcp_server.tool_kg_add(
+                "Alice",
+                "works_at",
+                "Acme",
+                valid_from=value,
+            )
+
+            assert result["success"] is False, value
+            assert "valid_from" in result["error"]
+            assert "YYYY-MM-DDTHH:MM:SSZ" in result["error"]
+
+    def test_kg_query_rejects_non_canonical_datetime_as_of(
+        self, monkeypatch, config, palace_path, kg
+    ):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        from mempalace import mcp_server
+
+        invalid_values = [
+            "2026-05-06T14:23:00+02:00",
+            "2026-05-06T14:23:00-05:30",
+            "2026-05-06T14:23:00.123Z",
+            "2026-05-06 14:23:00",
+            "2026-05-06T14:23:00",
+        ]
+
+        for value in invalid_values:
+            result = mcp_server.tool_kg_query(
+                "Alice",
+                as_of=value,
+                direction="outgoing",
+            )
+
+            assert "error" in result, value
+            assert "as_of" in result["error"]
+            assert "YYYY-MM-DDTHH:MM:SSZ" in result["error"]
+
+    def test_kg_invalidate_rejects_non_canonical_ended(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        kg.add_triple(
             "Alice",
-            as_of="2026-05-06T14:23",
-            direction="outgoing",
+            "works_at",
+            "Acme",
+            valid_from="2026-05-06T14:00:00Z",
         )
 
-        assert "error" in result
-        assert "as_of" in result["error"]
-        assert "ISO-8601 date or datetime" in result["error"]
+        from mempalace import mcp_server
+
+        invalid_values = [
+            "2026-05-06T14:23:00+02:00",
+            "2026-05-06T14:23:00-05:30",
+            "2026-05-06T14:23:00.123Z",
+            "2026-05-06 14:23:00",
+            "2026-05-06T14:23:00",
+        ]
+
+        for value in invalid_values:
+            result = mcp_server.tool_kg_invalidate(
+                "Alice",
+                "works_at",
+                "Acme",
+                ended=value,
+            )
+
+            assert result["success"] is False, value
+            assert "ended" in result["error"]
+            assert "YYYY-MM-DDTHH:MM:SSZ" in result["error"]
+
+    def test_kg_add_rejects_timezone_offset_datetime(self, monkeypatch, config, palace_path, kg):
+        _patch_mcp_server(monkeypatch, config, kg)
+
+        from mempalace import mcp_server
+
+        result = mcp_server.tool_kg_add(
+            "Alice",
+            "works_at",
+            "Acme",
+            valid_from="2026-05-06T14:23:00+02:00",
+        )
+
+        assert result["success"] is False
+        assert "valid_from" in result["error"]
+        assert "YYYY-MM-DDTHH:MM:SSZ" in result["error"]
 
 
 # ── Diary Tools ─────────────────────────────────────────────────────────

From 6694198c292aa505caaa26b32cf45dc5f1a24084 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Fri, 8 May 2026 08:40:11 +0000
Subject: [PATCH 109/127] fix(kg): support Python 3.9 type annotations

---
 mempalace/knowledge_graph.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py
index 3548502..22a0066 100644
--- a/mempalace/knowledge_graph.py
+++ b/mempalace/knowledge_graph.py
@@ -42,6 +42,7 @@
 import threading
 from datetime import date, datetime
 from pathlib import Path
+from typing import Optional
 from .config import sanitize_iso_temporal
 
 
@@ -52,7 +53,7 @@ def _is_date_only_temporal(value: str) -> bool:
     return isinstance(value, str) and len(value) == 10 and value[4] == "-" and value[7] == "-"
 
 
-def _temporal_start_key(value: str | None) -> str | None:
+def _temporal_start_key(value: Optional[str]) -> Optional[str]:
     """Return the comparable instant for a valid_from/as_of value."""
 
     if value is None:
@@ -64,7 +65,7 @@ def _temporal_start_key(value: str | None) -> str | None:
     return value
 
 
-def _temporal_end_key(value: str | None) -> str | None:
+def _temporal_end_key(value: Optional[str]) -> Optional[str]:
     """Return the comparable instant for a valid_to value.
 
     Date-only valid_to values represent the whole day for backward
@@ -80,7 +81,7 @@ def _temporal_end_key(value: str | None) -> str | None:
     return value
 
 
-def _triple_valid_at(valid_from: str | None, valid_to: str | None, as_of: str) -> bool:
+def _triple_valid_at(valid_from: Optional[str], valid_to: Optional[str], as_of: str) -> bool:
     as_of_key = _temporal_start_key(as_of)
     valid_from_key = _temporal_start_key(valid_from)
     valid_to_key = _temporal_end_key(valid_to)

From db69d1f1cccc108d13cd7585badbc19180d35ce3 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Fri, 8 May 2026 14:21:19 +0000
Subject: [PATCH 110/127] fix(kg): tighten canonical temporal datetime support

---
 mempalace/config.py          |   4 +-
 mempalace/knowledge_graph.py | 106 ++++++++++++++++++++++++++---------
 2 files changed, 83 insertions(+), 27 deletions(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index 608a938..dfeae63 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -87,8 +87,8 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
 # (as_of, valid_from, valid_to, ended).
 #
 # The KG stores temporal values as TEXT. Lexicographic comparisons are only
-# safe when all datetime values use one canonical shape. Accept full dates
-# for legacy compatibility and exact UTC datetimes for sub-day precision.
+# safe when datetime values use one canonical shape. Accept full dates for
+# legacy compatibility and exact UTC datetimes for sub-day precision.
 #
 # Accepted:
 #   YYYY-MM-DD
diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py
index 22a0066..ab653ff 100644
--- a/mempalace/knowledge_graph.py
+++ b/mempalace/knowledge_graph.py
@@ -95,6 +95,51 @@ def _triple_valid_at(valid_from: Optional[str], valid_to: Optional[str], as_of:
     return True
 
 
+def _sql_temporal_start_expr(column: str) -> str:
+    """SQLite expression for comparing valid_from-style temporal values."""
+
+    return (
+        f"CASE WHEN length({column}) = 10 "
+        f"AND substr({column}, 5, 1) = '-' "
+        f"AND substr({column}, 8, 1) = '-' "
+        f"THEN {column} || 'T00:00:00Z' ELSE {column} END"
+    )
+
+
+def _sql_temporal_end_expr(column: str) -> str:
+    """SQLite expression for comparing valid_to-style temporal values."""
+
+    return (
+        f"CASE WHEN length({column}) = 10 "
+        f"AND substr({column}, 5, 1) = '-' "
+        f"AND substr({column}, 8, 1) = '-' "
+        f"THEN {column} || 'T23:59:59Z' ELSE {column} END"
+    )
+
+
+def _temporal_filter_sql(as_of: str) -> tuple[str, list[str]]:
+    """Return SQL and parameters for an as-of temporal filter.
+
+    Date-only KG values are normalized for comparison:
+
+    - valid_from='2026-05-06' compares as '2026-05-06T00:00:00Z'
+    - valid_to='2026-05-06' compares as '2026-05-06T23:59:59Z'
+
+    This keeps legacy date-only facts working when callers query with
+    canonical UTC datetimes such as '2026-05-06T15:00:00Z'.
+    """
+
+    as_of_key = _temporal_start_key(as_of)
+    valid_from_expr = _sql_temporal_start_expr("t.valid_from")
+    valid_to_expr = _sql_temporal_end_expr("t.valid_to")
+
+    return (
+        f" AND (t.valid_from IS NULL OR {valid_from_expr} <= ?) "
+        f"AND (t.valid_to IS NULL OR {valid_to_expr} >= ?)",
+        [as_of_key, as_of_key],
+    )
+
+
 class KnowledgeGraph:
     def __init__(self, db_path: str = None):
         self.db_path = db_path or DEFAULT_KG_PATH
@@ -212,19 +257,21 @@ def add_triple(
 
         ``source_drawer_id`` and ``adapter_name`` are RFC 002 §5.5 provenance
         fields populated by adapters that advertise ``supports_kg_triples``;
-        they default to ``None`` so every existing caller stays source-compatible.
+        they default to ``None`` so every existing caller stays
+        source-compatible.
 
         Examples:
             add_triple("Max", "child_of", "Alice", valid_from="2015-04-01")
             add_triple("Max", "does", "swimming", valid_from="2025-01-01")
-            add_triple("Alice", "worried_about", "Max injury", valid_from="2026-01", valid_to="2026-02")
+            add_triple("Alice", "worried_about", "Max injury", valid_from="2026-01-01")
         """
+
         valid_from = sanitize_iso_temporal(valid_from, "valid_from")
         valid_to = sanitize_iso_temporal(valid_to, "valid_to")
 
-        # Reject inverted intervals. Use temporal comparison keys rather than raw
-        # string comparison so legacy date-only values and canonical UTC datetimes can
-        # safely coexist.
+        # Reject inverted intervals. Use temporal comparison keys rather than
+        # raw string comparison so legacy date-only values and canonical UTC
+        # datetimes can safely coexist.
         if (
             valid_from is not None
             and valid_to is not None
@@ -244,10 +291,12 @@ def add_triple(
             conn = self._conn()
             with conn:
                 conn.execute(
-                    "INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)", (sub_id, subject)
+                    "INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)",
+                    (sub_id, subject),
                 )
                 conn.execute(
-                    "INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)", (obj_id, obj)
+                    "INSERT OR IGNORE INTO entities (id, name) VALUES (?, ?)",
+                    (obj_id, obj),
                 )
 
                 # Check for existing identical triple
@@ -255,17 +304,14 @@ def add_triple(
                     "SELECT id FROM triples WHERE subject=? AND predicate=? AND object=? AND valid_to IS NULL",
                     (sub_id, pred, obj_id),
                 ).fetchone()
-
                 if existing:
                     return existing["id"]  # Already exists and still valid
 
                 triple_id = f"t_{sub_id}_{pred}_{obj_id}_{hashlib.sha256(f'{valid_from}{datetime.now().isoformat()}'.encode()).hexdigest()[:12]}"
-
                 conn.execute(
                     """INSERT INTO triples (
-                        id, subject, predicate, object,
-                        valid_from, valid_to, confidence,
-                        source_closet, source_file,
+                        id, subject, predicate, object, valid_from, valid_to,
+                        confidence, source_closet, source_file,
                         source_drawer_id, adapter_name
                     ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                     (
@@ -282,7 +328,7 @@ def add_triple(
                         adapter_name,
                     ),
                 )
-        return triple_id
+                return triple_id
 
     def invalidate(self, subject: str, predicate: str, obj: str, ended: str = None):
         """Mark a relationship as no longer valid (set valid_to date/time)."""
@@ -328,18 +374,23 @@ def query_entity(self, name: str, as_of: str = None, direction: str = "outgoing"
         as_of = sanitize_iso_temporal(as_of, "as_of")
         eid = self._entity_id(name)
         results = []
+
+        temporal_sql = ""
+        temporal_params = []
+        if as_of:
+            temporal_sql, temporal_params = _temporal_filter_sql(as_of)
+
         with self._lock:
             conn = self._conn()
 
             if direction in ("outgoing", "both"):
                 query = (
                     "SELECT t.*, e.name as obj_name FROM triples t "
-                    "JOIN entities e ON t.object = e.id WHERE t.subject = ?"
+                    "JOIN entities e ON t.object = e.id WHERE t.subject = ?" + temporal_sql
                 )
-                for row in conn.execute(query, [eid]).fetchall():
-                    if as_of and not _triple_valid_at(row["valid_from"], row["valid_to"], as_of):
-                        continue
+                params = [eid] + temporal_params
 
+                for row in conn.execute(query, params).fetchall():
                     results.append(
                         {
                             "direction": "outgoing",
@@ -357,12 +408,11 @@ def query_entity(self, name: str, as_of: str = None, direction: str = "outgoing"
             if direction in ("incoming", "both"):
                 query = (
                     "SELECT t.*, e.name as sub_name FROM triples t "
-                    "JOIN entities e ON t.subject = e.id WHERE t.object = ?"
+                    "JOIN entities e ON t.subject = e.id WHERE t.object = ?" + temporal_sql
                 )
-                for row in conn.execute(query, [eid]).fetchall():
-                    if as_of and not _triple_valid_at(row["valid_from"], row["valid_to"], as_of):
-                        continue
+                params = [eid] + temporal_params
 
+                for row in conn.execute(query, params).fetchall():
                     results.append(
                         {
                             "direction": "incoming",
@@ -376,12 +426,14 @@ def query_entity(self, name: str, as_of: str = None, direction: str = "outgoing"
                             "current": row["valid_to"] is None,
                         }
                     )
+
         return results
 
     def query_relationship(self, predicate: str, as_of: str = None):
         """Get all triples with a given relationship type."""
         as_of = sanitize_iso_temporal(as_of, "as_of")
         pred = predicate.lower().replace(" ", "_")
+
         query = """
             SELECT t.*, s.name as sub_name, o.name as obj_name
             FROM triples t
@@ -389,13 +441,17 @@ def query_relationship(self, predicate: str, as_of: str = None):
             JOIN entities o ON t.object = o.id
             WHERE t.predicate = ?
         """
+        params = [pred]
+
+        if as_of:
+            temporal_sql, temporal_params = _temporal_filter_sql(as_of)
+            query += temporal_sql
+            params.extend(temporal_params)
+
         results = []
         with self._lock:
             conn = self._conn()
-            for row in conn.execute(query, [pred]).fetchall():
-                if as_of and not _triple_valid_at(row["valid_from"], row["valid_to"], as_of):
-                    continue
-
+            for row in conn.execute(query, params).fetchall():
                 results.append(
                     {
                         "subject": row["sub_name"],

From 29c0c8059e2605e2fac53f28b585fd9366856d69 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Fri, 8 May 2026 14:44:04 +0000
Subject: [PATCH 111/127] fix(kg): close sqlite connections during cleanup

---
 mempalace/knowledge_graph.py  | 18 ++++++++++++++++++
 tests/test_knowledge_graph.py | 23 +++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py
index ab653ff..8c3a749 100644
--- a/mempalace/knowledge_graph.py
+++ b/mempalace/knowledge_graph.py
@@ -221,6 +221,24 @@ def close(self):
                 self._connection.close()
                 self._connection = None
 
+    def __enter__(self):
+        """Allow KnowledgeGraph to be used as a context manager."""
+        return self
+
+    def __exit__(self, exc_type, exc, tb):
+        """Close the SQLite connection when leaving a context manager block."""
+        self.close()
+        return False
+
+    def __del__(self):
+        """Best-effort cleanup for callers/tests that forget to call close()."""
+        try:
+            self.close()
+        except Exception:
+            # Destructors must never raise, especially during interpreter
+            # shutdown when module globals may already be partially torn down.
+            pass
+
     def _entity_id(self, name: str) -> str:
         return name.lower().replace(" ", "_").replace("'", "")
 
diff --git a/tests/test_knowledge_graph.py b/tests/test_knowledge_graph.py
index 7f02253..8e9c811 100644
--- a/tests/test_knowledge_graph.py
+++ b/tests/test_knowledge_graph.py
@@ -6,6 +6,8 @@
 """
 
 import pytest
+import sqlite3
+from mempalace.knowledge_graph import KnowledgeGraph
 
 
 class TestEntityOperations:
@@ -299,3 +301,24 @@ def test_invalidate_rejects_timezone_offset_ended(self, kg):
                 "Acme",
                 ended="2026-05-06T20:30:00-05:00",
             )
+
+
+class TestKnowledgeGraphConnectionCleanup:
+    def test_close_closes_connection_and_resets_handle(self, tmp_path):
+        kg = KnowledgeGraph(str(tmp_path / "kg.sqlite3"))
+        conn = kg._conn()
+
+        kg.close()
+
+        assert kg._connection is None
+        with pytest.raises(sqlite3.ProgrammingError):
+            conn.execute("SELECT 1")
+
+    def test_context_manager_closes_connection(self, tmp_path):
+        with KnowledgeGraph(str(tmp_path / "kg.sqlite3")) as kg:
+            conn = kg._conn()
+            kg.add_entity("Alice")
+
+        assert kg._connection is None
+        with pytest.raises(sqlite3.ProgrammingError):
+            conn.execute("SELECT 1")

From 1fbcb739c9c19dbca64b672ff389b58a2feb1b90 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Fri, 8 May 2026 15:27:54 +0000
Subject: [PATCH 112/127] fix(kg): tighten temporal handling and cleanup

---
 mempalace/config.py          | 12 +++++++++---
 mempalace/knowledge_graph.py | 23 -----------------------
 tests/conftest.py            | 10 ++++++++++
 tests/test_config.py         |  4 ++++
 4 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/mempalace/config.py b/mempalace/config.py
index dfeae63..ab478ec 100644
--- a/mempalace/config.py
+++ b/mempalace/config.py
@@ -93,14 +93,16 @@ def sanitize_kg_value(value: str, field_name: str = "value") -> str:
 # Accepted:
 #   YYYY-MM-DD
 #   YYYY-MM-DDTHH:MM:SSZ
+#   YYYY-MM-DDTHH:MM:SS+00:00  (normalized to ...Z)
 #
 # Rejected:
-#   partial dates, naive datetimes, timezone offsets, fractional seconds,
-#   and SQLite-style space-separated datetimes.
+#   partial dates, naive datetimes, non-UTC timezone offsets, fractional
+#   seconds, and SQLite-style space-separated datetimes.
 _ISO_DATE_RE = re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$")
 
 _ISO_UTC_DATETIME_RE = re.compile(
-    r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])" r"T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\dZ$"
+    r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])"
+    r"T(?:[01]\d|2[0-3]):[0-5]\d:[0-5]\d(?:Z|\+00:00)$"
 )
 
 
@@ -127,6 +129,7 @@ def sanitize_iso_temporal(value, field_name: str = "date"):
 
     - ``YYYY-MM-DD``
     - ``YYYY-MM-DDTHH:MM:SSZ``
+    - ``YYYY-MM-DDTHH:MM:SS+00:00`` normalized to ``...Z``
 
     Partial dates are rejected because KG queries compare TEXT temporal values.
     Non-canonical datetime forms are rejected because mixed temporal string
@@ -148,6 +151,9 @@ def sanitize_iso_temporal(value, field_name: str = "date"):
             "(expected YYYY-MM-DD or YYYY-MM-DDTHH:MM:SSZ)"
         ) from None
 
+    if value.endswith("+00:00"):
+        value = f"{value[:-6]}Z"
+
     return value
 
 
diff --git a/mempalace/knowledge_graph.py b/mempalace/knowledge_graph.py
index 8c3a749..774ee79 100644
--- a/mempalace/knowledge_graph.py
+++ b/mempalace/knowledge_graph.py
@@ -81,20 +81,6 @@ def _temporal_end_key(value: Optional[str]) -> Optional[str]:
     return value
 
 
-def _triple_valid_at(valid_from: Optional[str], valid_to: Optional[str], as_of: str) -> bool:
-    as_of_key = _temporal_start_key(as_of)
-    valid_from_key = _temporal_start_key(valid_from)
-    valid_to_key = _temporal_end_key(valid_to)
-
-    if valid_from_key is not None and valid_from_key > as_of_key:
-        return False
-
-    if valid_to_key is not None and valid_to_key < as_of_key:
-        return False
-
-    return True
-
-
 def _sql_temporal_start_expr(column: str) -> str:
     """SQLite expression for comparing valid_from-style temporal values."""
 
@@ -230,15 +216,6 @@ def __exit__(self, exc_type, exc, tb):
         self.close()
         return False
 
-    def __del__(self):
-        """Best-effort cleanup for callers/tests that forget to call close()."""
-        try:
-            self.close()
-        except Exception:
-            # Destructors must never raise, especially during interpreter
-            # shutdown when module globals may already be partially torn down.
-            pass
-
     def _entity_id(self, name: str) -> str:
         return name.lower().replace(" ", "_").replace("'", "")
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 4ed82ca..eb5c525 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -42,6 +42,16 @@ def _clear_cache():
         try:
             from mempalace import mcp_server
 
+            for kg in list(getattr(mcp_server, "_kg_by_path", {}).values()):
+                close = getattr(kg, "close", None)
+                if close is not None:
+                    try:
+                        close()
+                    except Exception:
+                        pass
+            if hasattr(mcp_server, "_kg_by_path"):
+                mcp_server._kg_by_path.clear()
+
             mcp_server._client_cache = None
             mcp_server._collection_cache = None
         except (ImportError, AttributeError):
diff --git a/tests/test_config.py b/tests/test_config.py
index 93dacf3..faea345 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -341,3 +341,7 @@ def test_iso_temporal_rejects_invalid_calendar_date():
 def test_iso_temporal_error_names_field():
     with pytest.raises(ValueError, match="as_of"):
         sanitize_iso_temporal("2026-05-06T14:23", "as_of")
+
+
+def test_iso_temporal_normalizes_plus_zero_offset_to_z():
+    assert sanitize_iso_temporal("2026-05-06T14:23:00+00:00") == "2026-05-06T14:23:00Z"

From 1d3eecbf9d1d8b6b2395254397b358580e4a1225 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sat, 9 May 2026 03:16:03 +0500
Subject: [PATCH 113/127] feat(sync): add gitignore-aware drawer prune (#1252)

Add `mempalace sync` CLI command and `mempalace_sync` MCP tool that
prune drawers whose source files are gitignored, deleted, or moved
out of the project. Reuses the existing GitignoreMatcher
infrastructure in mempalace/miner.py so the same gitignore rules
that block ingest also drive the corresponding cleanup.

Closes #1252.
---
 mempalace/cli.py               |  111 ++++
 mempalace/mcp_server.py        |   52 ++
 mempalace/sync.py              |  298 +++++++++
 tests/test_sync.py             | 1128 ++++++++++++++++++++++++++++++++
 website/reference/mcp-tools.md |   16 +-
 5 files changed, 1604 insertions(+), 1 deletion(-)
 create mode 100644 mempalace/sync.py
 create mode 100644 tests/test_sync.py

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 964fa84..395130d 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -579,6 +579,84 @@ def cmd_sweep(args):
         sys.exit(1)
 
 
+def cmd_sync(args):
+    """Prune drawers whose source files are gitignored, deleted, or moved (#1252)."""
+    from .mcp_server import _wal_log
+    from .palace import MineAlreadyRunning
+    from .sync import sync_palace
+
+    palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
+
+    if not os.path.isdir(palace_path):
+        print(f"\n  No palace found at {palace_path}")
+        return
+
+    project_dirs = []
+    if args.dir:
+        project_dirs.append(os.path.expanduser(args.dir))
+    project_dirs.extend(os.path.expanduser(r) for r in args.root)
+    project_dirs = project_dirs or None
+
+    print(f"\n{'=' * 55}")
+    print("  MemPalace Sync — Gitignore-aware drawer prune")
+    print(f"{'=' * 55}")
+    print(f"  Palace:   {palace_path}")
+    if args.wing:
+        print(f"  Wing:     {args.wing}")
+    if project_dirs:
+        for p in project_dirs:
+            print(f"  Project:  {p}")
+    if args.dry_run:
+        print("  Mode:     DRY RUN (no deletions)")
+    else:
+        print("  Mode:     APPLY (deleting drawers)")
+    print(f"{'-' * 55}\n")
+
+    try:
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=project_dirs,
+            wing=args.wing,
+            dry_run=args.dry_run,
+            wal_log=_wal_log,
+        )
+    except MineAlreadyRunning as exc:
+        print(f"mempalace: {exc}", file=sys.stderr)
+        sys.exit(1)
+    except ValueError as exc:
+        print(f"mempalace: {exc}", file=sys.stderr)
+        sys.exit(2)
+    except Exception as exc:
+        print(f"mempalace: sync failed: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+    removed_suffix = "(would remove)" if args.dry_run else "(removed)"
+    print(f"  Scanned:        {report['scanned']}")
+    print(f"  Kept:           {report['kept']}")
+    print(f"  Gitignored:     {report['gitignored']}  {removed_suffix}")
+    print(f"  Missing:        {report['missing']}  {removed_suffix}")
+    print(f"  No source:      {report['no_source']}  (kept)")
+    print(f"  Out of scope:   {report['out_of_scope']}  (kept)")
+
+    by_source = report.get("by_source") or {}
+    if by_source:
+        top = sorted(by_source.items(), key=lambda kv: -kv[1])[:5]
+        label = "Top sources to remove" if args.dry_run else "Top sources removed"
+        print(f"\n  {label}:")
+        for src, n in top:
+            print(f"    {src}  ({n})")
+
+    if args.dry_run:
+        if report["gitignored"] + report["missing"] > 0:
+            print("\n  Re-run with --apply to commit these deletions.")
+    else:
+        print(
+            f"\n  Removed {report['removed_drawers']} drawers, {report['removed_closets']} closets."
+        )
+
+    print(f"\n{'=' * 55}\n")
+
+
 def cmd_search(args):
     from .searcher import search, SearchError
 
@@ -1214,6 +1292,38 @@ def main():
         help="A .jsonl transcript file, or a directory to scan recursively",
     )
 
+    # sync
+    p_sync = sub.add_parser(
+        "sync",
+        help="Prune drawers whose source files are gitignored, deleted, or moved (#1252)",
+    )
+    p_sync.add_argument(
+        "dir",
+        nargs="?",
+        default=None,
+        help="Project root to sync (optional; auto-detects from drawer metadata)",
+    )
+    p_sync.add_argument("--wing", default=None, help="Limit to one wing")
+    p_sync.add_argument(
+        "--root",
+        action="append",
+        default=[],
+        help="Additional project root (repeatable)",
+    )
+    p_sync.add_argument(
+        "--dry-run",
+        dest="dry_run",
+        action="store_true",
+        default=True,
+        help="Preview only (default)",
+    )
+    p_sync.add_argument(
+        "--apply",
+        dest="dry_run",
+        action="store_false",
+        help="Actually delete drawers (overrides --dry-run; requires --wing or a project root)",
+    )
+
     # search
     p_search = sub.add_parser("search", help="Find anything, exact words")
     p_search.add_argument("query", help="What to search for")
@@ -1422,6 +1532,7 @@ def main():
         "split": cmd_split,
         "search": cmd_search,
         "sweep": cmd_sweep,
+        "sync": cmd_sync,
         "mcp": cmd_mcp,
         "compress": cmd_compress,
         "wake-up": cmd_wakeup,
diff --git a/mempalace/mcp_server.py b/mempalace/mcp_server.py
index 521cb07..c88e7d1 100644
--- a/mempalace/mcp_server.py
+++ b/mempalace/mcp_server.py
@@ -990,6 +990,40 @@ def tool_delete_drawer(drawer_id: str):
         return {"success": False, "error": str(e)}
 
 
+def tool_sync(project_dir: str = None, wing: str = None, apply: bool = False):
+    """Prune drawers whose source files are gitignored, missing, or moved (#1252)."""
+    global _metadata_cache
+    from .palace import MineAlreadyRunning
+    from .sync import sync_palace
+
+    if not _config.palace_path:
+        np = _no_palace()
+        return {"success": False, "error": np.get("error", "no palace"), "hint": np.get("hint")}
+    project_dirs = [project_dir] if project_dir else None
+    try:
+        try:
+            report = sync_palace(
+                palace_path=_config.palace_path,
+                project_dirs=project_dirs,
+                wing=wing,
+                dry_run=not apply,
+                wal_log=_wal_log,
+            )
+            return {"success": True, **report}
+        # Order matters: typed handlers must precede the bare Exception
+        # below, otherwise MineAlreadyRunning and ValueError fall into the
+        # generic "sync failed" branch and break the structured-error tests.
+        except MineAlreadyRunning as exc:
+            return {"success": False, "error": f"another mine is in progress: {exc}"}
+        except ValueError as exc:
+            return {"success": False, "error": str(exc)}
+        except Exception as exc:
+            return {"success": False, "error": f"sync failed: {exc}"}
+    finally:
+        if apply:
+            _metadata_cache = None
+
+
 def tool_get_drawer(drawer_id: str):
     """Fetch a single drawer by ID. Returns full content and metadata."""
     col = _get_collection()
@@ -1886,6 +1920,24 @@ def tool_reconnect():
         },
         "handler": tool_delete_drawer,
     },
+    "mempalace_sync": {
+        "description": "Prune drawers whose source files are gitignored, deleted, or moved. Returns dry-run report by default; pass apply=true to commit deletions.",
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "project_dir": {
+                    "type": "string",
+                    "description": "Project root to scope the sync (optional; auto-detected from drawer metadata if omitted)",
+                },
+                "wing": {"type": "string", "description": "Limit to one wing (optional)"},
+                "apply": {
+                    "type": "boolean",
+                    "description": "Actually delete drawers; default is dry-run preview",
+                },
+            },
+        },
+        "handler": tool_sync,
+    },
     "mempalace_get_drawer": {
         "description": "Fetch a single drawer by ID — returns full content and metadata.",
         "input_schema": {
diff --git a/mempalace/sync.py b/mempalace/sync.py
new file mode 100644
index 0000000..8b4c2d9
--- /dev/null
+++ b/mempalace/sync.py
@@ -0,0 +1,298 @@
+"""
+sync.py — Gitignore-aware drawer prune (#1252).
+
+Removes drawers whose source files are now gitignored, deleted, or moved
+out of the project. Reuses the same GitignoreMatcher infrastructure that
+the miner uses on the way in, so the same rules that block ingest also
+drive the corresponding cleanup.
+
+Usage:
+    from mempalace.sync import sync_palace
+    report = sync_palace(palace_path, project_dirs=["/repo"], dry_run=True)
+"""
+
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Callable, Optional, TypedDict
+
+from .miner import is_gitignored, load_gitignore_matcher
+from .palace import (
+    MineAlreadyRunning,
+    get_closets_collection,
+    get_collection,
+    mine_palace_lock,
+    purge_file_closets,
+)
+
+
+logger = logging.getLogger(__name__)
+_BATCH = 1000
+
+
+class SyncReport(TypedDict):
+    scanned: int
+    kept: int
+    gitignored: int
+    missing: int
+    no_source: int
+    out_of_scope: int
+    removed_drawers: int
+    removed_closets: int
+    dry_run: bool
+    by_source: dict[str, int]
+
+
+def _resolve_project_root(source_file: Path, project_roots: list) -> Optional[Path]:
+    """Return the longest project_root that source_file lives under."""
+    best: Optional[Path] = None
+    for root in project_roots:
+        try:
+            source_file.relative_to(root)
+        except ValueError:
+            continue
+        if best is None or len(str(root)) > len(str(best)):
+            best = root
+    return best
+
+
+def _ancestor_matchers(source_file: Path, root: Path, matcher_cache: dict) -> list:
+    """Build the ancestor-chain matcher list, root → file's parent.
+
+    Callers are expected to invoke this only after `_resolve_project_root`
+    confirms `source_file` lives under `root`. The defensive try/except
+    keeps the function safe if a future caller skips that check.
+    """
+    matchers: list = []
+    try:
+        parts = source_file.relative_to(root).parts
+    except ValueError:
+        return matchers
+    cursor = root
+    matcher = load_gitignore_matcher(cursor, matcher_cache)
+    if matcher is not None:
+        matchers.append(matcher)
+    for part in parts[:-1]:
+        cursor = cursor / part
+        matcher = load_gitignore_matcher(cursor, matcher_cache)
+        if matcher is not None:
+            matchers.append(matcher)
+    return matchers
+
+
+def _is_registry_row(meta: dict, drawer_id: str) -> bool:
+    """Convo miner sentinels track 'have I seen this transcript' — preserve them.
+
+    Deleting a `_reg_*` sentinel makes the next mine pass re-chunk and re-embed
+    the entire transcript even though its content has not changed.
+    """
+    if (meta or {}).get("room") == "_registry":
+        return True
+    if (meta or {}).get("ingest_mode") == "registry":
+        return True
+    if drawer_id and drawer_id.startswith("_reg_"):
+        return True
+    return False
+
+
+def _classify_drawer(
+    meta: dict, matcher_cache: dict, project_roots: list, drawer_id: str = ""
+) -> str:
+    """Classify a drawer by its source_file metadata.
+
+    Returns one of: kept, gitignored, missing, no_source, out_of_scope.
+    """
+    if _is_registry_row(meta, drawer_id):
+        return "kept"
+
+    source_file = (meta or {}).get("source_file")
+    if not source_file:
+        return "no_source"
+
+    src = Path(source_file)
+    if not src.is_absolute():
+        return "no_source"
+
+    root = _resolve_project_root(src, project_roots)
+    if root is None:
+        return "out_of_scope"
+
+    if not src.exists():
+        return "missing"
+
+    matchers = _ancestor_matchers(src, root, matcher_cache)
+    if matchers and is_gitignored(src, matchers, is_dir=False):
+        return "gitignored"
+
+    return "kept"
+
+
+def _iter_drawer_metadata(col, wing: Optional[str]):
+    """Yield (id, metadata) tuples from the drawers collection in batches."""
+    offset = 0
+    where = {"wing": wing} if wing else None
+    while True:
+        kwargs = {"include": ["metadatas"], "limit": _BATCH, "offset": offset}
+        if where:
+            kwargs["where"] = where
+        batch = col.get(**kwargs)
+        ids = batch.get("ids") or []
+        metas = batch.get("metadatas") or []
+        if not ids:
+            return
+        for drawer_id, meta in zip(ids, metas):
+            yield drawer_id, meta
+        if len(ids) < _BATCH:
+            return
+        offset += len(ids)
+
+
+def _auto_detect_project_roots(col, wing: Optional[str]) -> list:
+    """Walk drawer metadata once collecting candidate project roots.
+
+    A path is a project root if any ancestor up to filesystem root holds
+    a `.git` directory or a `.gitignore` file. The deepest such ancestor
+    wins, so nested-but-still-tracked subprojects are honoured.
+    `Path.parents` iterates deepest-first, so the first hit IS deepest.
+    """
+    roots = set()
+    for _, meta in _iter_drawer_metadata(col, wing):
+        source_file = (meta or {}).get("source_file")
+        if not source_file:
+            continue
+        src = Path(source_file)
+        if not src.is_absolute():
+            continue
+        for parent in src.parents:
+            if (parent / ".git").exists() or (parent / ".gitignore").is_file():
+                roots.add(parent.resolve(strict=False))
+                break
+    # Sort by depth (deepest first) with secondary lexicographic key for
+    # deterministic order when two roots share string length.
+    return sorted(roots, key=lambda p: (-len(str(p)), str(p)))
+
+
+def _normalize_project_dirs(project_dirs) -> list:
+    return [Path(p).resolve(strict=False) for p in project_dirs]
+
+
+def _delete_in_batches(col, ids: list, batch_size: int, wal_log: Optional[Callable]):
+    """Delete drawer IDs in batches, optionally logging each batch to WAL."""
+    deleted = 0
+    for i in range(0, len(ids), batch_size):
+        chunk = ids[i : i + batch_size]
+        col.delete(ids=chunk)
+        deleted += len(chunk)
+        if wal_log is not None:
+            wal_log(
+                "sync_prune",
+                {"first_id": chunk[0]},
+                {"removed_count": len(chunk)},
+            )
+    return deleted
+
+
+def sync_palace(
+    palace_path: str,
+    project_dirs: Optional[list] = None,
+    wing: Optional[str] = None,
+    dry_run: bool = True,
+    batch_size: int = _BATCH,
+    wal_log: Optional[Callable] = None,
+) -> SyncReport:
+    """Prune drawers whose source files are gitignored, missing, or moved.
+
+    Returns a SyncReport with bucket counts. Dry-run by default; pass
+    dry_run=False to actually delete drawers and matching closets.
+
+    Holds ``mine_palace_lock`` for the whole call so the classify pass and
+    the apply branch see the same drawer snapshot. Raises
+    ``MineAlreadyRunning`` if another mine is in progress on this palace.
+
+    On apply (``dry_run=False``), at least one of ``wing`` or
+    ``project_dirs`` must be set so a caller cannot accidentally prune
+    every wing in a multi-project palace via auto-detected roots.
+    """
+    if not dry_run and not wing and not project_dirs:
+        raise ValueError(
+            "sync apply requires explicit wing= or project_dirs= so it cannot "
+            "auto-prune every wing in a multi-project palace; pass --wing or "
+            "a project directory"
+        )
+    if project_dirs is not None and not project_dirs:
+        raise ValueError(
+            "project_dirs was provided but is empty; pass at least one project "
+            "root or pass project_dirs=None to auto-detect from drawer metadata"
+        )
+
+    counts = {
+        "scanned": 0,
+        "kept": 0,
+        "gitignored": 0,
+        "missing": 0,
+        "no_source": 0,
+        "out_of_scope": 0,
+    }
+    by_source: dict = defaultdict(int)
+    removable_ids: list = []
+    removable_sources: set = set()
+
+    with mine_palace_lock(palace_path):
+        col = get_collection(palace_path, create=False)
+
+        if project_dirs is not None:
+            roots = _normalize_project_dirs(project_dirs)
+        else:
+            roots = _auto_detect_project_roots(col, wing)
+
+        matcher_cache: dict = {}
+
+        for drawer_id, meta in _iter_drawer_metadata(col, wing):
+            counts["scanned"] += 1
+            bucket = _classify_drawer(meta or {}, matcher_cache, roots, drawer_id)
+            counts[bucket] += 1
+            if bucket in ("gitignored", "missing"):
+                removable_ids.append(drawer_id)
+                src = (meta or {}).get("source_file")
+                if src:
+                    removable_sources.add(src)
+                    by_source[src] += 1
+
+        report: SyncReport = {
+            **counts,
+            "removed_drawers": 0,
+            "removed_closets": 0,
+            "dry_run": dry_run,
+            "by_source": dict(by_source),
+        }
+
+        if dry_run or not removable_ids:
+            return report
+
+        report["removed_drawers"] = _delete_in_batches(col, removable_ids, batch_size, wal_log)
+
+        closets_col = None
+        try:
+            closets_col = get_closets_collection(palace_path, create=False)
+        except Exception as exc:
+            logger.warning("Closet purge skipped (collection unavailable): %s", exc)
+
+        closets_removed = 0
+        if closets_col is not None:
+            for source_file in removable_sources:
+                before = (
+                    closets_col.get(where={"source_file": source_file}, include=[]).get("ids") or []
+                )
+                if not before:
+                    continue
+                purge_file_closets(closets_col, source_file)
+                closets_removed += len(before)
+        report["removed_closets"] = closets_removed
+    return report
+
+
+__all__ = [
+    "MineAlreadyRunning",
+    "SyncReport",
+    "sync_palace",
+]
diff --git a/tests/test_sync.py b/tests/test_sync.py
new file mode 100644
index 0000000..f18261e
--- /dev/null
+++ b/tests/test_sync.py
@@ -0,0 +1,1128 @@
+"""
+test_sync.py — Tests for `mempalace.sync` (gitignore-aware drawer prune, #1252).
+
+Builds a focused fixture: a temp project with .gitignore + on-disk files +
+matching drawers, exercising every classification bucket sync produces.
+"""
+
+import os
+from pathlib import Path
+
+import chromadb
+import pytest
+
+
+def _seed_drawers(palace_path, repo_path, deleted_path):
+    """Populate the drawers collection with 6 entries covering all buckets."""
+    client = chromadb.PersistentClient(path=palace_path)
+    col = client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+
+    metas = [
+        {
+            "wing": "demo",
+            "room": "src",
+            "source_file": str(repo_path / "src" / "keep.py"),
+            "chunk_index": 0,
+            "added_by": "miner",
+            "filed_at": "2026-05-09T00:00:00",
+        },
+        {
+            "wing": "demo",
+            "room": "build",
+            "source_file": str(repo_path / "build" / "ignored.py"),
+            "chunk_index": 0,
+            "added_by": "miner",
+            "filed_at": "2026-05-09T00:00:00",
+        },
+        {
+            "wing": "demo",
+            "room": "logs",
+            "source_file": str(repo_path / "app.log"),
+            "chunk_index": 0,
+            "added_by": "miner",
+            "filed_at": "2026-05-09T00:00:00",
+        },
+        {
+            "wing": "demo",
+            "room": "stale",
+            "source_file": str(deleted_path),
+            "chunk_index": 0,
+            "added_by": "miner",
+            "filed_at": "2026-05-09T00:00:00",
+        },
+        {
+            "wing": "demo",
+            "room": "convo",
+            # No source_file key — convo / explicit-add drawers.
+            "chunk_index": 0,
+            "added_by": "convo_miner",
+            "filed_at": "2026-05-09T00:00:00",
+        },
+        {
+            "wing": "demo",
+            "room": "elsewhere",
+            "source_file": "/tmp/elsewhere/x.md",
+            "chunk_index": 0,
+            "added_by": "miner",
+            "filed_at": "2026-05-09T00:00:00",
+        },
+    ]
+
+    col.add(
+        ids=[
+            "drawer_keep",
+            "drawer_gitignored_dir",
+            "drawer_gitignored_glob",
+            "drawer_missing",
+            "drawer_no_source",
+            "drawer_out_of_scope",
+        ],
+        documents=[f"doc {i}" for i in range(6)],
+        embeddings=[[float(i + 1), 0.0, 0.0] for i in range(6)],
+        metadatas=metas,
+    )
+    del client
+
+
+@pytest.fixture
+def synced_world(tmp_dir, palace_path):
+    """Temp project with .gitignore + on-disk files + matching drawers."""
+    repo_path = Path(tmp_dir) / "repo"
+    (repo_path / "src").mkdir(parents=True)
+    (repo_path / "build").mkdir()
+
+    # .gitignore: ignore build/ directory and any *.log file
+    (repo_path / ".gitignore").write_text("build/\n*.log\n")
+
+    # Files that exist on disk
+    (repo_path / "src" / "keep.py").write_text("# keep\n")
+    (repo_path / "build" / "ignored.py").write_text("# ignored by gitignore\n")
+    (repo_path / "app.log").write_text("log line\n")
+
+    # File that the drawer points to but no longer exists
+    deleted = repo_path / "deleted.py"
+    deleted.write_text("# was here\n")
+    deleted.unlink()
+
+    _seed_drawers(palace_path, repo_path, deleted)
+    return {"palace_path": palace_path, "repo_path": str(repo_path)}
+
+
+def _open_drawers(palace_path):
+    client = chromadb.PersistentClient(path=palace_path)
+    col = client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+    return client, col
+
+
+def _drawer_ids(col):
+    return set(col.get(include=[])["ids"])
+
+
+class TestSyncPalace:
+    def test_dry_run_classifies_correctly(self, synced_world):
+        from mempalace.sync import sync_palace
+
+        report = sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=True,
+        )
+        assert report["scanned"] == 6
+        assert report["gitignored"] == 2  # build/ignored.py, app.log
+        assert report["missing"] == 1  # deleted.py
+        assert report["no_source"] == 1
+        assert report["out_of_scope"] == 1
+        assert report["kept"] == 1  # only src/keep.py
+        assert report["dry_run"] is True
+        assert report["removed_drawers"] == 0
+
+        # Mutation check — collection still has all 6 drawers.
+        client, col = _open_drawers(synced_world["palace_path"])
+        try:
+            assert len(_drawer_ids(col)) == 6
+        finally:
+            del client
+
+    def test_apply_removes_gitignored_and_missing(self, synced_world):
+        from mempalace.sync import sync_palace
+
+        report = sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+        )
+        assert report["dry_run"] is False
+        assert report["removed_drawers"] == 3  # 2 gitignored + 1 missing
+
+        client, col = _open_drawers(synced_world["palace_path"])
+        try:
+            survivors = _drawer_ids(col)
+            assert survivors == {
+                "drawer_keep",
+                "drawer_no_source",
+                "drawer_out_of_scope",
+            }
+        finally:
+            del client
+
+    def test_dry_run_does_not_touch_collection(self, synced_world):
+        from mempalace.sync import sync_palace
+
+        client, col = _open_drawers(synced_world["palace_path"])
+        before = _drawer_ids(col)
+        del client
+
+        sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=True,
+        )
+
+        client, col = _open_drawers(synced_world["palace_path"])
+        try:
+            after = _drawer_ids(col)
+        finally:
+            del client
+        assert before == after
+
+    def test_wing_scope_filters(self, tmp_dir, palace_path):
+        """A drawer in another wing must survive a wing-scoped sync."""
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        (repo_path / "build").mkdir(parents=True)
+        (repo_path / ".gitignore").write_text("build/\n")
+        (repo_path / "build" / "ignored.py").write_text("# ignored\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_demo", "d_other"],
+            documents=["x", "y"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "build",
+                    "source_file": str(repo_path / "build" / "ignored.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+                {
+                    "wing": "other",
+                    "room": "build",
+                    "source_file": str(repo_path / "build" / "ignored.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+            ],
+        )
+        del client
+
+        sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="demo",
+            dry_run=False,
+        )
+
+        client, col = _open_drawers(palace_path)
+        try:
+            assert _drawer_ids(col) == {"d_other"}
+        finally:
+            del client
+
+    def test_no_source_file_drawers_preserved_on_apply(self, synced_world):
+        from mempalace.sync import sync_palace
+
+        sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+        )
+        client, col = _open_drawers(synced_world["palace_path"])
+        try:
+            assert "drawer_no_source" in _drawer_ids(col)
+        finally:
+            del client
+
+    def test_out_of_scope_drawers_preserved(self, synced_world):
+        from mempalace.sync import sync_palace
+
+        sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+        )
+        client, col = _open_drawers(synced_world["palace_path"])
+        try:
+            assert "drawer_out_of_scope" in _drawer_ids(col)
+        finally:
+            del client
+
+    def test_negated_gitignore_rules_respected(self, tmp_dir, palace_path):
+        """`!build/keep.py` must un-ignore one specific file under build/."""
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        (repo_path / "build").mkdir(parents=True)
+        (repo_path / ".gitignore").write_text("build/\n!build/keep.py\n")
+        (repo_path / "build" / "keep.py").write_text("# survivor\n")
+        (repo_path / "build" / "doomed.py").write_text("# doomed\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_keep", "d_doom"],
+            documents=["x", "y"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "build",
+                    "source_file": str(repo_path / "build" / "keep.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+                {
+                    "wing": "demo",
+                    "room": "build",
+                    "source_file": str(repo_path / "build" / "doomed.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+            ],
+        )
+        del client
+
+        sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            dry_run=False,
+        )
+
+        client, col = _open_drawers(palace_path)
+        try:
+            survivors = _drawer_ids(col)
+        finally:
+            del client
+        assert "d_keep" in survivors
+        assert "d_doom" not in survivors
+
+    def test_nested_gitignore_layers(self, tmp_dir, palace_path):
+        """Subdir .gitignore can deny what root allows."""
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        (repo_path / "vendor").mkdir(parents=True)
+        # Root gitignore is empty.
+        (repo_path / ".gitignore").write_text("\n")
+        # Subdir gitignore ignores everything under vendor/.
+        (repo_path / "vendor" / ".gitignore").write_text("*.py\n")
+        (repo_path / "vendor" / "lib.py").write_text("# nested-ignored\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_nested"],
+            documents=["x"],
+            embeddings=[[1.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "vendor",
+                    "source_file": str(repo_path / "vendor" / "lib.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+            ],
+        )
+        del client
+
+        sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            dry_run=False,
+        )
+
+        client, col = _open_drawers(palace_path)
+        try:
+            assert "d_nested" not in _drawer_ids(col)
+        finally:
+            del client
+
+    def test_closet_purge_runs_on_apply(self, synced_world):
+        """Closets pointing at removed sources must also disappear."""
+        from mempalace.sync import sync_palace
+
+        # Seed a closet referencing the to-be-pruned ignored.py source.
+        client = chromadb.PersistentClient(path=synced_world["palace_path"])
+        closets = client.get_or_create_collection(
+            "mempalace_closets", metadata={"hnsw:space": "cosine"}
+        )
+        ignored_path = str(Path(synced_world["repo_path"]) / "build" / "ignored.py")
+        closets.add(
+            ids=["closet_ignored_01"],
+            documents=["topic line"],
+            embeddings=[[1.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "build",
+                    "source_file": ignored_path,
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+            ],
+        )
+        del client
+
+        report = sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+        )
+        assert report["removed_closets"] >= 1
+
+        client = chromadb.PersistentClient(path=synced_world["palace_path"])
+        closets = client.get_or_create_collection(
+            "mempalace_closets", metadata={"hnsw:space": "cosine"}
+        )
+        try:
+            assert closets.get(ids=["closet_ignored_01"])["ids"] == []
+        finally:
+            del client
+
+    def test_handles_empty_palace(self, palace_path):
+        from mempalace.sync import sync_palace
+
+        client = chromadb.PersistentClient(path=palace_path)
+        client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+        del client
+
+        report = sync_palace(palace_path=palace_path, dry_run=True)
+        assert report["scanned"] == 0
+        assert report["removed_drawers"] == 0
+
+    def test_emits_wal_entries_on_apply(self, synced_world):
+        from mempalace.sync import sync_palace
+
+        seen = []
+
+        def fake_wal(operation, params, result=None):
+            seen.append((operation, params, result))
+
+        sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+            wal_log=fake_wal,
+        )
+
+        ops = [op for op, _, _ in seen]
+        assert "sync_prune" in ops
+        # F4 — result payload carries the audit trail.
+        sync_entry = next(e for e in seen if e[0] == "sync_prune")
+        op, params, result = sync_entry
+        assert result is not None and "removed_count" in result
+        assert result["removed_count"] >= 1
+        # Allow-list — params must be exactly the documented audit shape so
+        # any future leak (source_file, content, ID lists, etc.) trips a
+        # test failure rather than slipping through a deny-list.
+        assert set(params.keys()) <= {
+            "first_id"
+        }, f"WAL params drifted from the audit allow-list: {params.keys()}"
+
+    def test_registry_sentinels_preserved_on_apply(self, tmp_dir, palace_path):
+        """F2 regression: convo miner `_reg_*` sentinels must survive sync apply.
+
+        Deleting them forces full re-mine + re-embed of the transcript on the
+        next miner run, even though the transcript content has not changed.
+        """
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        repo_path.mkdir(parents=True)
+        (repo_path / ".gitignore").write_text("transcripts/\n")
+        (repo_path / "transcripts").mkdir()
+        moved_transcript = repo_path / "transcripts" / "convo.jsonl"
+        moved_transcript.write_text("{}\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=[
+                "_reg_abc123_room_match",
+                "_reg_def456_meta_match",
+                "_reg_ghi789_id_match",
+            ],
+            documents=["[registry] x", "[registry] y", "[registry] z"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0], [3.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "_registry",
+                    "source_file": str(moved_transcript),
+                    "chunk_index": 0,
+                    "added_by": "convo_miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+                {
+                    "wing": "demo",
+                    "room": "convo",
+                    "source_file": str(moved_transcript),
+                    "chunk_index": 0,
+                    "added_by": "convo_miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                    "ingest_mode": "registry",
+                },
+                {
+                    "wing": "demo",
+                    "room": "convo",
+                    "source_file": str(moved_transcript),
+                    "chunk_index": 0,
+                    "added_by": "convo_miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+            ],
+        )
+        del client
+
+        # Sentinel transcript is gitignored; without F2 it would also delete
+        # the `_reg_*` sentinel rows.
+        sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            dry_run=False,
+        )
+
+        client, col = _open_drawers(palace_path)
+        try:
+            survivors = _drawer_ids(col)
+        finally:
+            del client
+        assert "_reg_abc123_room_match" in survivors  # room=_registry
+        assert "_reg_def456_meta_match" in survivors  # ingest_mode=registry
+        assert "_reg_ghi789_id_match" in survivors  # id prefix
+
+    def test_auto_detect_picks_deepest_root(self, tmp_dir, palace_path):
+        """F3 regression (white-box): when multiple ancestors hold markers
+        the DEEPEST one wins. Direct assertion on the helper avoids the
+        tautology of round-1's classifier-based test where ancestor walks
+        loaded the same matcher chain regardless of which root was picked.
+        """
+        from mempalace.sync import _auto_detect_project_roots
+
+        outer = Path(tmp_dir) / "outer"
+        inner = outer / "inner"
+        inner.mkdir(parents=True)
+        # Both have markers. Deepest wins.
+        (outer / ".gitignore").write_text("*.txt\n")
+        (inner / ".gitignore").write_text("*.py\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_inner"],
+            documents=["x"],
+            embeddings=[[1.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "src",
+                    "source_file": str(inner / "x.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+            ],
+        )
+        del client
+
+        client, col = _open_drawers(palace_path)
+        try:
+            roots = _auto_detect_project_roots(col, wing="demo")
+        finally:
+            del client
+
+        inner_resolved = inner.resolve(strict=False)
+        outer_resolved = outer.resolve(strict=False)
+        assert inner_resolved in roots, f"expected inner in roots, got {roots}"
+        assert (
+            outer_resolved not in roots
+        ), f"deepest should win exclusively: roots={roots}, outer leaked"
+
+    def test_apply_with_empty_project_dirs_raises(self, palace_path):
+        """Round-2 P1: `project_dirs=[]` (empty list) with apply must raise,
+        not silently classify everything as out_of_scope."""
+        from mempalace.sync import sync_palace
+
+        client = chromadb.PersistentClient(path=palace_path)
+        client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+        del client
+
+        with pytest.raises(ValueError, match="empty"):
+            sync_palace(
+                palace_path=palace_path,
+                project_dirs=[],
+                wing="demo",
+                dry_run=False,
+            )
+
+    def test_closet_log_warning_when_collection_unavailable(
+        self, monkeypatch, synced_world, caplog
+    ):
+        """F7 regression: closets-collection-missing logs a warning."""
+        import logging
+
+        from mempalace import sync as sync_mod
+        from mempalace.sync import sync_palace
+
+        def boom(*args, **kwargs):
+            raise RuntimeError("simulated missing closets collection")
+
+        monkeypatch.setattr(sync_mod, "get_closets_collection", boom)
+
+        with caplog.at_level(logging.WARNING, logger="mempalace.sync"):
+            sync_palace(
+                palace_path=synced_world["palace_path"],
+                project_dirs=[synced_world["repo_path"]],
+                dry_run=False,
+            )
+        assert any(
+            "Closet purge skipped" in record.getMessage() for record in caplog.records
+        ), f"expected closet-skip warning, got: {[r.getMessage() for r in caplog.records]}"
+
+    def test_metadata_cache_cleared_on_exception(self, monkeypatch, config, synced_world, kg):
+        """F9 regression: tool_sync's try/finally must clear `_metadata_cache`
+        even if sync_palace raises mid-apply.
+
+        Tracks an explicit `called` flag on the explode mock so a refactor
+        that bypasses the patched name (and lets the real sync_palace run)
+        cannot fake-pass — the assertion below verifies the patched explode
+        actually ran before the cache was cleared.
+        """
+        from mempalace import mcp_server
+
+        # Reconfigure to point at synced_world.
+        from mempalace.config import MempalaceConfig
+        import json
+
+        cfg_dir = Path(synced_world["palace_path"]).parent / "cfg_for_cache_test"
+        cfg_dir.mkdir(parents=True, exist_ok=True)
+        with open(cfg_dir / "config.json", "w") as f:
+            json.dump({"palace_path": synced_world["palace_path"]}, f)
+        monkeypatch.setattr(mcp_server, "_config", MempalaceConfig(config_dir=str(cfg_dir)))
+        monkeypatch.setattr(mcp_server, "_get_kg", lambda: kg)
+        monkeypatch.setattr(mcp_server, "_metadata_cache", ["dirty-cache-marker"])
+
+        called = {"n": 0}
+
+        def explode(*args, **kwargs):
+            called["n"] += 1
+            raise RuntimeError("simulated mid-apply failure")
+
+        monkeypatch.setattr("mempalace.sync.sync_palace", explode)
+
+        # tool_sync's broad except catches RuntimeError → returns structured error.
+        result = mcp_server.tool_sync(
+            project_dir=synced_world["repo_path"], wing="demo", apply=True
+        )
+        assert called["n"] == 1, "explode mock did not actually run; test is a fake-pass"
+        assert result.get("success") is False
+        assert "simulated" in result.get("error", "")
+
+        assert (
+            mcp_server._metadata_cache is None
+        ), "F9: cache must be cleared even when sync_palace raises"
+
+    def test_sync_report_keys_stable(self, synced_world):
+        """Regression: SyncReport schema must not silently drop a field."""
+        from mempalace.sync import sync_palace
+
+        report = sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=True,
+        )
+        expected = {
+            "scanned",
+            "kept",
+            "gitignored",
+            "missing",
+            "no_source",
+            "out_of_scope",
+            "removed_drawers",
+            "removed_closets",
+            "dry_run",
+            "by_source",
+        }
+        assert set(report.keys()) == expected
+
+    def test_batch_size_boundary(self, tmp_dir, palace_path):
+        """`_delete_in_batches` correctness at batch_size smaller than dataset."""
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        repo_path.mkdir(parents=True)
+        (repo_path / ".gitignore").write_text("ignored/\n")
+        (repo_path / "ignored").mkdir()
+        n = 5
+        for i in range(n):
+            (repo_path / "ignored" / f"f{i}.py").write_text(f"# {i}\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=[f"d_{i}" for i in range(n)],
+            documents=[f"x{i}" for i in range(n)],
+            embeddings=[[float(i + 1), 0.0, 0.0] for i in range(n)],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "ignored",
+                    "source_file": str(repo_path / "ignored" / f"f{i}.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+                for i in range(n)
+            ],
+        )
+        del client
+
+        seen = []
+
+        def fake_wal(operation, params, result=None):
+            if operation == "sync_prune":
+                seen.append(result["removed_count"])
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="demo",
+            dry_run=False,
+            batch_size=2,
+            wal_log=fake_wal,
+        )
+        assert report["removed_drawers"] == n
+        # 5 ids at batch_size=2 → chunks of 2,2,1 → 3 wal entries
+        assert seen == [2, 2, 1], f"unexpected chunk sizes: {seen}"
+
+    def test_apply_is_idempotent(self, synced_world):
+        """Round-3: a second apply on the same palace must be a no-op."""
+        from mempalace.sync import sync_palace
+
+        first = sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+        )
+        assert first["removed_drawers"] >= 1
+
+        second = sync_palace(
+            palace_path=synced_world["palace_path"],
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+        )
+        assert second["removed_drawers"] == 0
+        assert second["gitignored"] == 0
+        assert second["missing"] == 0
+
+    def test_relative_source_file_classified_as_no_source(self, tmp_dir, palace_path):
+        """Round-3: a drawer whose source_file metadata is relative is upstream
+        corruption (miner writes absolute paths). Sync must NOT guess at
+        path resolution; it routes the drawer to `no_source` and leaves it."""
+        from mempalace.sync import sync_palace
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_relative"],
+            documents=["x"],
+            embeddings=[[1.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "src",
+                    "source_file": "relative/path.py",  # malformed, not absolute
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+            ],
+        )
+        del client
+
+        repo_path = Path(tmp_dir) / "repo"
+        repo_path.mkdir()
+        (repo_path / ".gitignore").write_text("*.py\n")
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="demo",
+            dry_run=False,
+        )
+        assert report["no_source"] == 1
+        assert report["removed_drawers"] == 0
+
+        client, col = _open_drawers(palace_path)
+        try:
+            assert "d_relative" in _drawer_ids(col)
+        finally:
+            del client
+
+    def test_overlapping_project_dirs_picks_longest(self, tmp_dir, palace_path):
+        """`_resolve_project_root` longest-prefix matching: nested project
+        dirs both contain the source; the deeper (longer) one wins."""
+        from mempalace.sync import sync_palace
+
+        outer = Path(tmp_dir) / "outer"
+        inner = outer / "inner"
+        inner.mkdir(parents=True)
+        # Outer .gitignore would NOT block file. Inner .gitignore blocks it.
+        (outer / ".gitignore").write_text("# empty\n")
+        (inner / ".gitignore").write_text("x.py\n")
+        (inner / "x.py").write_text("# inner-ignored\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_x"],
+            documents=["x"],
+            embeddings=[[1.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "src",
+                    "source_file": str(inner / "x.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+            ],
+        )
+        del client
+
+        # Pass BOTH outer AND inner as project_dirs. inner is the longest
+        # prefix, so it should be the chosen root and inner/.gitignore
+        # rules apply (file is ignored → drawer removed).
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(outer), str(inner)],
+            wing="demo",
+            dry_run=False,
+        )
+        assert report["gitignored"] == 1, f"expected 1 gitignored, got {report}"
+
+    def test_apply_without_scope_raises(self, palace_path):
+        """F6: apply=True with both wing=None AND project_dirs=None refuses."""
+        from mempalace.sync import sync_palace
+
+        # Empty palace; we never reach delete code, but the guard must fire
+        # before any work.
+        client = chromadb.PersistentClient(path=palace_path)
+        client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+        del client
+
+        with pytest.raises(ValueError, match="explicit wing="):
+            sync_palace(palace_path=palace_path, dry_run=False)
+
+        # Dry-run with no scope is still allowed — preview is read-only.
+        report = sync_palace(palace_path=palace_path, dry_run=True)
+        assert report["dry_run"] is True
+
+    @pytest.mark.skipif(os.name == "nt", reason="fcntl-based contention test is POSIX only")
+    def test_mine_already_running_propagates(self, synced_world):
+        """F1 + T4: sync acquires `mine_palace_lock` for the whole call.
+
+        Hold the palace lock via raw fcntl on a separate open file
+        description; mine_palace_lock opens its own handle and must
+        raise MineAlreadyRunning rather than silently running against
+        a partial snapshot.
+        """
+        import fcntl
+        import hashlib
+
+        from mempalace.palace import MineAlreadyRunning
+        from mempalace.sync import sync_palace
+
+        palace_path = synced_world["palace_path"]
+        resolved = os.path.realpath(os.path.expanduser(palace_path))
+        palace_key = hashlib.sha256(os.path.normcase(resolved).encode()).hexdigest()[:16]
+        lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
+        os.makedirs(lock_dir, exist_ok=True)
+        lock_path = os.path.join(lock_dir, f"mine_palace_{palace_key}.lock")
+        Path(lock_path).touch()
+
+        with open(lock_path, "r+") as lf:
+            fcntl.flock(lf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            try:
+                with pytest.raises(MineAlreadyRunning):
+                    sync_palace(
+                        palace_path=palace_path,
+                        project_dirs=[synced_world["repo_path"]],
+                        dry_run=True,
+                    )
+            finally:
+                fcntl.flock(lf, fcntl.LOCK_UN)
+
+        # Lock released — sync now succeeds.
+        sync_palace(
+            palace_path=palace_path,
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=True,
+        )
+
+
+class TestSyncMcpTool:
+    """T2: `mempalace_sync` MCP entry point must keep apply polarity stable."""
+
+    def _patch(self, monkeypatch, config, kg):
+        from mempalace import mcp_server
+
+        monkeypatch.setattr(mcp_server, "_config", config)
+        monkeypatch.setattr(mcp_server, "_get_kg", lambda: kg)
+
+    def test_default_is_dry_run(self, monkeypatch, config, palace_path, kg):
+        from mempalace import mcp_server
+
+        self._patch(monkeypatch, config, kg)
+        client = chromadb.PersistentClient(path=palace_path)
+        client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+        del client
+
+        report = mcp_server.tool_sync(project_dir=palace_path)
+        assert report["dry_run"] is True
+
+    def test_success_true_on_dry_run(self, monkeypatch, config, palace_path, kg):
+        """Round-4: success path returns `success: True` for API symmetry
+        with the structured-error branches that all return `success: False`."""
+        from mempalace import mcp_server
+
+        self._patch(monkeypatch, config, kg)
+        client = chromadb.PersistentClient(path=palace_path)
+        client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+        del client
+
+        report = mcp_server.tool_sync(project_dir=palace_path)
+        assert report.get("success") is True
+        assert report.get("dry_run") is True
+
+    def test_apply_true_is_destructive(self, monkeypatch, config, synced_world, kg):
+        from mempalace import mcp_server
+
+        # Rebuild config to point at synced_world's palace.
+        from mempalace.config import MempalaceConfig
+        import json
+
+        cfg_dir = Path(synced_world["palace_path"]).parent / "cfg_for_mcp_test"
+        cfg_dir.mkdir(parents=True, exist_ok=True)
+        with open(cfg_dir / "config.json", "w") as f:
+            json.dump({"palace_path": synced_world["palace_path"]}, f)
+        cfg = MempalaceConfig(config_dir=str(cfg_dir))
+        self._patch(monkeypatch, cfg, kg)
+
+        report = mcp_server.tool_sync(
+            project_dir=synced_world["repo_path"], apply=True, wing="demo"
+        )
+        assert report["dry_run"] is False
+        assert report["removed_drawers"] >= 1
+
+    def test_no_palace_returns_structured_error(self, monkeypatch, kg):
+        """Round-3: tool_sync must keep the {success:False,error:...} contract
+        even on the early `_no_palace` short-circuit, not return the bare
+        legacy `{error,hint}` dict."""
+        from mempalace import mcp_server
+
+        class _EmptyConfig:
+            palace_path = ""
+            collection_name = "mempalace_drawers"
+
+        monkeypatch.setattr(mcp_server, "_config", _EmptyConfig())
+        monkeypatch.setattr(mcp_server, "_get_kg", lambda: kg)
+
+        result = mcp_server.tool_sync()
+        assert result.get("success") is False
+        assert "error" in result
+
+    def test_apply_without_scope_returns_structured_error(
+        self, monkeypatch, config, palace_path, kg
+    ):
+        """Round-2 P0: tool_sync must return {success: False, error: ...}
+        rather than letting ValueError propagate to the MCP client."""
+        from mempalace import mcp_server
+
+        client = chromadb.PersistentClient(path=palace_path)
+        client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
+        del client
+
+        self._patch(monkeypatch, config, kg)
+        result = mcp_server.tool_sync(apply=True)  # no project_dir, no wing
+        assert result.get("success") is False
+        assert "wing=" in result.get("error", "") or "project_dirs" in result.get("error", "")
+
+    @pytest.mark.skipif(os.name == "nt", reason="fcntl-based contention test is POSIX only")
+    def test_lock_contention_returns_structured_error(self, monkeypatch, config, synced_world, kg):
+        """Round-2 P0: tool_sync with apply=True under contention returns
+        a structured `{success: False, error: ...}` instead of raising."""
+        import fcntl
+        import hashlib
+
+        from mempalace import mcp_server
+        from mempalace.config import MempalaceConfig
+        import json
+
+        # Wire MCP config at synced_world.
+        cfg_dir = Path(synced_world["palace_path"]).parent / "cfg_for_lock_test"
+        cfg_dir.mkdir(parents=True, exist_ok=True)
+        with open(cfg_dir / "config.json", "w") as f:
+            json.dump({"palace_path": synced_world["palace_path"]}, f)
+        self._patch(monkeypatch, MempalaceConfig(config_dir=str(cfg_dir)), kg)
+
+        # Compute lock path the same way mine_palace_lock does.
+        resolved = os.path.realpath(os.path.expanduser(synced_world["palace_path"]))
+        palace_key = hashlib.sha256(os.path.normcase(resolved).encode()).hexdigest()[:16]
+        lock_dir = os.path.join(os.path.expanduser("~"), ".mempalace", "locks")
+        os.makedirs(lock_dir, exist_ok=True)
+        lock_path = os.path.join(lock_dir, f"mine_palace_{palace_key}.lock")
+        Path(lock_path).touch()
+
+        with open(lock_path, "r+") as lf:
+            fcntl.flock(lf, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            try:
+                result = mcp_server.tool_sync(
+                    project_dir=synced_world["repo_path"], wing="demo", apply=True
+                )
+            finally:
+                fcntl.flock(lf, fcntl.LOCK_UN)
+
+        assert result.get("success") is False
+        assert "another mine" in result.get("error", "").lower()
+
+
+class TestSyncCli:
+    """T1: `cmd_sync` argparse + dispatch wrapper round-trip."""
+
+    def test_dry_run_default_no_mutation(self, monkeypatch, tmp_dir, synced_world, capsys):
+        from mempalace import cli
+
+        argv = [
+            "mempalace",
+            "--palace",
+            synced_world["palace_path"],
+            "sync",
+            synced_world["repo_path"],
+        ]
+        monkeypatch.setattr("sys.argv", argv)
+        cli.main()
+
+        captured = capsys.readouterr().out
+        assert "DRY RUN" in captured
+        assert "would remove" in captured
+
+        client, col = _open_drawers(synced_world["palace_path"])
+        try:
+            assert len(_drawer_ids(col)) == 6  # synced_world seeds 6, dry-run touches none
+        finally:
+            del client
+
+    def test_apply_flag_deletes(self, monkeypatch, tmp_dir, synced_world, capsys):
+        from mempalace import cli
+
+        argv = [
+            "mempalace",
+            "--palace",
+            synced_world["palace_path"],
+            "sync",
+            synced_world["repo_path"],
+            "--apply",
+            "--wing",
+            "demo",
+        ]
+        monkeypatch.setattr("sys.argv", argv)
+        cli.main()
+
+        captured = capsys.readouterr().out
+        assert "Removed" in captured
+        assert "(removed)" in captured
+
+        client, col = _open_drawers(synced_world["palace_path"])
+        try:
+            survivors = _drawer_ids(col)
+        finally:
+            del client
+        assert survivors == {
+            "drawer_keep",
+            "drawer_no_source",
+            "drawer_out_of_scope",
+        }
+
+    def test_cli_emits_wal_on_apply(self, monkeypatch, synced_world):
+        """F8 regression: cmd_sync must wire `_wal_log` so CLI deletes are
+        audited. Without this, scripted CLI invocations leave no trail."""
+        from mempalace import cli, mcp_server
+
+        seen = []
+        original = mcp_server._wal_log
+
+        def recording_wal(operation, params, result=None):
+            seen.append((operation, params, result))
+            original(operation, params, result)
+
+        monkeypatch.setattr(mcp_server, "_wal_log", recording_wal)
+
+        argv = [
+            "mempalace",
+            "--palace",
+            synced_world["palace_path"],
+            "sync",
+            synced_world["repo_path"],
+            "--apply",
+            "--wing",
+            "demo",
+        ]
+        monkeypatch.setattr("sys.argv", argv)
+        cli.main()
+
+        ops = [op for op, _, _ in seen]
+        assert "sync_prune" in ops, f"CLI --apply did not emit WAL sync_prune entries; seen={ops}"
+
+    def test_apply_without_scope_exits_2(self, monkeypatch, synced_world, capsys):
+        """F6 + F8 CLI hardening: --apply with no scope exits non-zero."""
+        from mempalace import cli
+
+        argv = [
+            "mempalace",
+            "--palace",
+            synced_world["palace_path"],
+            "sync",
+            "--apply",
+        ]
+        monkeypatch.setattr("sys.argv", argv)
+        with pytest.raises(SystemExit) as exc_info:
+            cli.main()
+        assert exc_info.value.code == 2
diff --git a/website/reference/mcp-tools.md b/website/reference/mcp-tools.md
index 6866aa6..220b15e 100644
--- a/website/reference/mcp-tools.md
+++ b/website/reference/mcp-tools.md
@@ -1,6 +1,6 @@
 # MCP Tools Reference
 
-Detailed parameter schemas for all 29 MCP tools.
+Detailed parameter schemas for all 30 MCP tools.
 
 ## Palace — Read Tools
 
@@ -114,6 +114,20 @@ Delete a drawer by ID. Irreversible.
 
 ---
 
+### `mempalace_sync`
+
+Prune drawers whose source files are gitignored, deleted, or moved. Returns a dry-run report by default; pass `apply=true` to commit deletions.
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `project_dir` | string | No | Project root to scope the sync (auto-detected from drawer metadata if omitted) |
+| `wing` | string | No | Limit to one wing |
+| `apply` | boolean | No | Actually delete drawers; default is dry-run preview |
+
+**Returns:** `{ scanned, kept, gitignored, missing, no_source, out_of_scope, removed_drawers, removed_closets, dry_run, by_source }`
+
+---
+
 ### `mempalace_get_drawer`
 
 Fetch a single drawer by ID — returns full content and metadata.

From 0ff4121404457bc45595553252e3a88af4fb38d8 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sat, 9 May 2026 04:01:34 +0500
Subject: [PATCH 114/127] fix(sync): symmetric source_file resolve + perf
 optimizations

CI fix: `_classify_drawer` now resolves `source_file` symmetric to
`project_roots` (which `_normalize_project_dirs` and
`_auto_detect_project_roots` already `.resolve()`). Without this, on
platforms where the temp directory is a symlink (macOS `/var/folders` ->
`/private/var/folders`, Windows 8.3 short-name normalization), every
drawer mis-bucketed as `out_of_scope` and survived prune.

Perf:
- `_resolve_project_root`: early-return on first match (sorted-desc
  precondition).
- `_normalize_project_dirs`: sort `(-len(str(p)), str(p))` desc for
  early-return + deterministic tie-break on equal-length paths.
- `_auto_detect_project_roots`: `seen_sources` dedupe so a 200-chunk
  file costs one disk walk, not 200.
- `sync_palace` main loop: per-file classification cache; registry
  sentinels (`_reg_*`, `room=_registry`, `ingest_mode=registry`) routed
  to "kept" before cache lookup so a sentinel sharing a `source_file`
  with a pruned drawer cannot inherit a stale "gitignored" verdict.
- Closet purge: collapse O(N) per-file purge into one
  `where={"source_file": {"$in": [...]}}` get + one bulk delete.

Tests (5 new in `TestSyncPalace`, 38 total):
- `test_symlinked_project_root_resolves`: pins symmetric resolve via
  real `os.symlink` (skipped on Windows).
- `test_classification_cache_avoids_redundant_disk_hits`: monkeypatch
  counter on `_classify_drawer` asserts `call_count == 1` for 5 chunks
  sharing one source_file.
- `test_closet_batch_purge_single_call`: wraps closets collection with
  `CallCountingCol` (forwards `.get`/`.delete`); asserts
  `delete_calls == 1` and `get_calls == 1`; expected `removed_closets`
  derived from `report["by_source"]` to stay robust to fixture changes.
- `test_registry_check_runs_before_cache_lookup`: a regular drawer
  caches "gitignored" first; a sentinel with the same source_file must
  still be kept.
- `test_normalize_project_dirs_sort_stable_on_equal_length`: pins the
  alphabetical secondary key when paths share length.
---
 mempalace/sync.py  |  73 ++++++++----
 tests/test_sync.py | 292 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 328 insertions(+), 37 deletions(-)

diff --git a/mempalace/sync.py b/mempalace/sync.py
index 8b4c2d9..788be85 100644
--- a/mempalace/sync.py
+++ b/mempalace/sync.py
@@ -22,7 +22,6 @@
     get_closets_collection,
     get_collection,
     mine_palace_lock,
-    purge_file_closets,
 )
 
 
@@ -44,16 +43,18 @@ class SyncReport(TypedDict):
 
 
 def _resolve_project_root(source_file: Path, project_roots: list) -> Optional[Path]:
-    """Return the longest project_root that source_file lives under."""
-    best: Optional[Path] = None
+    """Return the longest project_root that source_file lives under.
+
+    Assumes ``project_roots`` is sorted by path-length descending so the
+    first match is the longest (deepest) prefix.
+    """
     for root in project_roots:
         try:
             source_file.relative_to(root)
+            return root
         except ValueError:
             continue
-        if best is None or len(str(root)) > len(str(best)):
-            best = root
-    return best
+    return None
 
 
 def _ancestor_matchers(source_file: Path, root: Path, matcher_cache: dict) -> list:
@@ -102,6 +103,7 @@ def _classify_drawer(
 
     Returns one of: kept, gitignored, missing, no_source, out_of_scope.
     """
+    # Defensive: main loop filters registry rows; this guards direct callers.
     if _is_registry_row(meta, drawer_id):
         return "kept"
 
@@ -112,6 +114,7 @@ def _classify_drawer(
     src = Path(source_file)
     if not src.is_absolute():
         return "no_source"
+    src = src.resolve(strict=False)
 
     root = _resolve_project_root(src, project_roots)
     if root is None:
@@ -154,12 +157,17 @@ def _auto_detect_project_roots(col, wing: Optional[str]) -> list:
     a `.git` directory or a `.gitignore` file. The deepest such ancestor
     wins, so nested-but-still-tracked subprojects are honoured.
     `Path.parents` iterates deepest-first, so the first hit IS deepest.
+
+    Dedupes on ``source_file`` string so a 200-chunk file costs one disk
+    walk, not 200.
     """
-    roots = set()
+    roots: set = set()
+    seen_sources: set = set()
     for _, meta in _iter_drawer_metadata(col, wing):
         source_file = (meta or {}).get("source_file")
-        if not source_file:
+        if not source_file or source_file in seen_sources:
             continue
+        seen_sources.add(source_file)
         src = Path(source_file)
         if not src.is_absolute():
             continue
@@ -167,13 +175,13 @@ def _auto_detect_project_roots(col, wing: Optional[str]) -> list:
             if (parent / ".git").exists() or (parent / ".gitignore").is_file():
                 roots.add(parent.resolve(strict=False))
                 break
-    # Sort by depth (deepest first) with secondary lexicographic key for
-    # deterministic order when two roots share string length.
     return sorted(roots, key=lambda p: (-len(str(p)), str(p)))
 
 
 def _normalize_project_dirs(project_dirs) -> list:
-    return [Path(p).resolve(strict=False) for p in project_dirs]
+    """Resolve and sort project dirs so deepest-prefix wins on first match."""
+    resolved = [Path(p).resolve(strict=False) for p in project_dirs]
+    return sorted(resolved, key=lambda p: (-len(str(p)), str(p)))
 
 
 def _delete_in_batches(col, ids: list, batch_size: int, wal_log: Optional[Callable]):
@@ -246,17 +254,30 @@ def sync_palace(
             roots = _auto_detect_project_roots(col, wing)
 
         matcher_cache: dict = {}
+        # Same source_file → same verdict holds because mine_palace_lock
+        # blocks concurrent writers and the loop is synchronous.
+        classification_cache: dict = {}
 
         for drawer_id, meta in _iter_drawer_metadata(col, wing):
             counts["scanned"] += 1
-            bucket = _classify_drawer(meta or {}, matcher_cache, roots, drawer_id)
+            meta = meta or {}
+            source_file = meta.get("source_file")
+
+            if _is_registry_row(meta, drawer_id):
+                bucket = "kept"
+            elif source_file and source_file in classification_cache:
+                bucket = classification_cache[source_file]
+            else:
+                bucket = _classify_drawer(meta, matcher_cache, roots, drawer_id)
+                if source_file:
+                    classification_cache[source_file] = bucket
+
             counts[bucket] += 1
             if bucket in ("gitignored", "missing"):
                 removable_ids.append(drawer_id)
-                src = (meta or {}).get("source_file")
-                if src:
-                    removable_sources.add(src)
-                    by_source[src] += 1
+                if source_file:
+                    removable_sources.add(source_file)
+                    by_source[source_file] += 1
 
         report: SyncReport = {
             **counts,
@@ -278,15 +299,17 @@ def sync_palace(
             logger.warning("Closet purge skipped (collection unavailable): %s", exc)
 
         closets_removed = 0
-        if closets_col is not None:
-            for source_file in removable_sources:
-                before = (
-                    closets_col.get(where={"source_file": source_file}, include=[]).get("ids") or []
-                )
-                if not before:
-                    continue
-                purge_file_closets(closets_col, source_file)
-                closets_removed += len(before)
+        if closets_col is not None and removable_sources:
+            closet_ids = (
+                closets_col.get(
+                    where={"source_file": {"$in": list(removable_sources)}},
+                    include=[],
+                ).get("ids")
+                or []
+            )
+            if closet_ids:
+                closets_col.delete(ids=closet_ids)
+                closets_removed = len(closet_ids)
         report["removed_closets"] = closets_removed
     return report
 
diff --git a/tests/test_sync.py b/tests/test_sync.py
index f18261e..38cbf80 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -441,9 +441,9 @@ def fake_wal(operation, params, result=None):
         # Allow-list — params must be exactly the documented audit shape so
         # any future leak (source_file, content, ID lists, etc.) trips a
         # test failure rather than slipping through a deny-list.
-        assert set(params.keys()) <= {
-            "first_id"
-        }, f"WAL params drifted from the audit allow-list: {params.keys()}"
+        assert set(params.keys()) <= {"first_id"}, (
+            f"WAL params drifted from the audit allow-list: {params.keys()}"
+        )
 
     def test_registry_sentinels_preserved_on_apply(self, tmp_dir, palace_path):
         """F2 regression: convo miner `_reg_*` sentinels must survive sync apply.
@@ -564,9 +564,9 @@ def test_auto_detect_picks_deepest_root(self, tmp_dir, palace_path):
         inner_resolved = inner.resolve(strict=False)
         outer_resolved = outer.resolve(strict=False)
         assert inner_resolved in roots, f"expected inner in roots, got {roots}"
-        assert (
-            outer_resolved not in roots
-        ), f"deepest should win exclusively: roots={roots}, outer leaked"
+        assert outer_resolved not in roots, (
+            f"deepest should win exclusively: roots={roots}, outer leaked"
+        )
 
     def test_apply_with_empty_project_dirs_raises(self, palace_path):
         """Round-2 P1: `project_dirs=[]` (empty list) with apply must raise,
@@ -605,9 +605,9 @@ def boom(*args, **kwargs):
                 project_dirs=[synced_world["repo_path"]],
                 dry_run=False,
             )
-        assert any(
-            "Closet purge skipped" in record.getMessage() for record in caplog.records
-        ), f"expected closet-skip warning, got: {[r.getMessage() for r in caplog.records]}"
+        assert any("Closet purge skipped" in record.getMessage() for record in caplog.records), (
+            f"expected closet-skip warning, got: {[r.getMessage() for r in caplog.records]}"
+        )
 
     def test_metadata_cache_cleared_on_exception(self, monkeypatch, config, synced_world, kg):
         """F9 regression: tool_sync's try/finally must clear `_metadata_cache`
@@ -648,9 +648,9 @@ def explode(*args, **kwargs):
         assert result.get("success") is False
         assert "simulated" in result.get("error", "")
 
-        assert (
-            mcp_server._metadata_cache is None
-        ), "F9: cache must be cleared even when sync_palace raises"
+        assert mcp_server._metadata_cache is None, (
+            "F9: cache must be cleared even when sync_palace raises"
+        )
 
     def test_sync_report_keys_stable(self, synced_world):
         """Regression: SyncReport schema must not silently drop a field."""
@@ -897,6 +897,274 @@ def test_mine_already_running_propagates(self, synced_world):
             dry_run=True,
         )
 
+    @pytest.mark.skipif(os.name == "nt", reason="os.symlink needs admin on Windows")
+    def test_symlinked_project_root_resolves(self, tmp_dir, palace_path):
+        """source_file may be written through a symlinked tmp directory
+        (real macOS behaviour: /var/folders/... is a symlink to
+        /private/var/folders/...). project_dirs goes through .resolve()
+        which follows the symlink. Without matching .resolve() on the
+        source side, _resolve_project_root would mis-bucket every drawer
+        as out_of_scope. This test pins symmetric resolution.
+        """
+        from mempalace.sync import sync_palace
+
+        real_root = Path(tmp_dir) / "real"
+        (real_root / "build").mkdir(parents=True)
+        (real_root / ".gitignore").write_text("build/\n")
+        (real_root / "build" / "x.py").write_text("# ignored\n")
+
+        link_root = Path(tmp_dir) / "link"
+        os.symlink(str(real_root), str(link_root))
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_via_link"],
+            documents=["x"],
+            embeddings=[[1.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "build",
+                    "source_file": str(link_root / "build" / "x.py"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+            ],
+        )
+        del client
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(real_root)],
+            wing="demo",
+            dry_run=True,
+        )
+        assert report["gitignored"] == 1, (
+            f"symmetric resolve broken: drawer mis-bucketed; report={report}"
+        )
+        assert report["out_of_scope"] == 0
+
+    def test_classification_cache_avoids_redundant_disk_hits(
+        self, tmp_dir, palace_path, monkeypatch
+    ):
+        """Per-file classification cache: N chunks of the same source_file
+        cost one _classify_drawer invocation, not N. Verifies the perf
+        optimisation actually short-circuits without changing behaviour.
+        """
+        from mempalace import sync as sync_mod
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        (repo_path / "build").mkdir(parents=True)
+        (repo_path / ".gitignore").write_text("build/\n")
+        (repo_path / "build" / "shared.py").write_text("# ignored\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=[f"d_chunk_{i}" for i in range(5)],
+            documents=[f"chunk{i}" for i in range(5)],
+            embeddings=[[float(i + 1), 0.0, 0.0] for i in range(5)],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "build",
+                    "source_file": str(repo_path / "build" / "shared.py"),
+                    "chunk_index": i,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                }
+                for i in range(5)
+            ],
+        )
+        del client
+
+        call_count = {"n": 0}
+        real_classify = sync_mod._classify_drawer
+
+        def counting_classify(*args, **kwargs):
+            call_count["n"] += 1
+            return real_classify(*args, **kwargs)
+
+        monkeypatch.setattr(sync_mod, "_classify_drawer", counting_classify)
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="demo",
+            dry_run=True,
+        )
+        assert report["scanned"] == 5
+        assert report["gitignored"] == 5
+        assert call_count["n"] == 1, (
+            f"cache miss: expected 1 _classify_drawer call (4 cache hits), got {call_count['n']}"
+        )
+
+    def test_closet_batch_purge_single_call(self, synced_world, monkeypatch):
+        """Batched $in closet purge: one delete() call across all removable
+        source files, not N. Wraps the real collection so chromadb still
+        does the work; only the call count is intercepted.
+        """
+        from mempalace import sync as sync_mod
+
+        repo_path = Path(synced_world["repo_path"])
+        palace_path = synced_world["palace_path"]
+
+        client = chromadb.PersistentClient(path=palace_path)
+        closets_col = client.get_or_create_collection(
+            "mempalace_closets", metadata={"hnsw:space": "cosine"}
+        )
+        closets_col.add(
+            ids=["c1", "c2", "c3"],
+            documents=["c1", "c2", "c3"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0], [3.0, 0.0, 0.0]],
+            metadatas=[
+                {"source_file": str(repo_path / "build" / "ignored.py")},
+                {"source_file": str(repo_path / "app.log")},
+                {"source_file": str(repo_path / "deleted.py")},
+            ],
+        )
+        del client
+
+        class CallCountingCol:
+            def __init__(self, real):
+                self._real = real
+                self.delete_calls = 0
+                self.get_calls = 0
+
+            def get(self, *args, **kwargs):
+                self.get_calls += 1
+                return self._real.get(*args, **kwargs)
+
+            def delete(self, *args, **kwargs):
+                self.delete_calls += 1
+                return self._real.delete(*args, **kwargs)
+
+        captured: dict = {}
+        real_get_closets = sync_mod.get_closets_collection
+
+        def wrapped_get_closets(p, create=False):
+            real = real_get_closets(p, create=create)
+            wrapper = CallCountingCol(real)
+            captured["wrapper"] = wrapper
+            return wrapper
+
+        monkeypatch.setattr(sync_mod, "get_closets_collection", wrapped_get_closets)
+
+        from mempalace.sync import sync_palace
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[synced_world["repo_path"]],
+            dry_run=False,
+        )
+
+        seeded_sources = {
+            str(repo_path / "build" / "ignored.py"),
+            str(repo_path / "app.log"),
+            str(repo_path / "deleted.py"),
+        }
+        expected = len(seeded_sources & set(report["by_source"].keys()))
+        assert report["removed_closets"] == expected, (
+            f"removed_closets ({report['removed_closets']}) != |seeded ∩ removable| ({expected})"
+        )
+        assert "wrapper" in captured, "get_closets_collection patch not invoked"
+        assert captured["wrapper"].delete_calls == 1, (
+            f"expected one batch delete call, got {captured['wrapper'].delete_calls}"
+        )
+        assert captured["wrapper"].get_calls == 1, (
+            f"expected one batch get call, got {captured['wrapper'].get_calls}"
+        )
+
+    def test_registry_check_runs_before_cache_lookup(self, tmp_dir, palace_path):
+        """A non-registry drawer with the same source_file must NOT poison
+        the bucket of a subsequent _reg_* drawer via the classification
+        cache. Order matters for chromadb iteration: seed the regular
+        drawer FIRST so it caches `gitignored`, then a registry sentinel
+        with the same source_file. Without the registry-bypass at the
+        top of the main loop, the cache lookup would route the sentinel
+        to gitignored and delete it.
+        """
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        (repo_path / "build").mkdir(parents=True)
+        (repo_path / ".gitignore").write_text("build/\n")
+        (repo_path / "build" / "shared.py").write_text("# ignored\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        shared_source = str(repo_path / "build" / "shared.py")
+        col.add(
+            ids=["a_regular", "_reg_zzz_sentinel"],
+            documents=["regular chunk", "registry sentinel"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "demo",
+                    "room": "build",
+                    "source_file": shared_source,
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+                {
+                    "wing": "demo",
+                    "room": "_registry",
+                    "source_file": shared_source,
+                    "chunk_index": 0,
+                    "ingest_mode": "registry",
+                    "added_by": "convo_miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+            ],
+        )
+        del client
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="demo",
+            dry_run=False,
+        )
+        assert report["gitignored"] == 1
+        assert report["kept"] == 1
+        assert report["removed_drawers"] == 1
+
+        client, col = _open_drawers(palace_path)
+        try:
+            survivors = _drawer_ids(col)
+        finally:
+            del client
+        assert "a_regular" not in survivors
+        assert "_reg_zzz_sentinel" in survivors, (
+            "registry sentinel was incorrectly pruned via cached non-registry verdict"
+        )
+
+    def test_normalize_project_dirs_sort_stable_on_equal_length(self):
+        """`_normalize_project_dirs` must sort by `(-len, str)` so equal-length
+        roots are alphabetically deterministic; otherwise overlapping nested
+        scope choice depends on argv order.
+        """
+        from mempalace.sync import _normalize_project_dirs
+
+        result = _normalize_project_dirs(["/tmp/zzz", "/tmp/aaa"])
+        names = [p.name for p in result]
+        assert names == ["aaa", "zzz"], f"equal-length sort not deterministic: got {names}"
+
+        # Different lengths: deepest first.
+        deep = _normalize_project_dirs(["/tmp/short", "/tmp/much/deeper/path"])
+        assert str(deep[0]).endswith("path")
+        assert str(deep[1]).endswith("short")
+
 
 class TestSyncMcpTool:
     """T2: `mempalace_sync` MCP entry point must keep apply polarity stable."""

From 18f877869b90611ab1ca2f20b4a1d958feb17122 Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sat, 9 May 2026 04:04:56 +0500
Subject: [PATCH 115/127] style(test_sync): match CI ruff 0.4.x format

---
 tests/test_sync.py | 60 +++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/tests/test_sync.py b/tests/test_sync.py
index 38cbf80..6ab2251 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -441,9 +441,9 @@ def fake_wal(operation, params, result=None):
         # Allow-list — params must be exactly the documented audit shape so
         # any future leak (source_file, content, ID lists, etc.) trips a
         # test failure rather than slipping through a deny-list.
-        assert set(params.keys()) <= {"first_id"}, (
-            f"WAL params drifted from the audit allow-list: {params.keys()}"
-        )
+        assert set(params.keys()) <= {
+            "first_id"
+        }, f"WAL params drifted from the audit allow-list: {params.keys()}"
 
     def test_registry_sentinels_preserved_on_apply(self, tmp_dir, palace_path):
         """F2 regression: convo miner `_reg_*` sentinels must survive sync apply.
@@ -564,9 +564,9 @@ def test_auto_detect_picks_deepest_root(self, tmp_dir, palace_path):
         inner_resolved = inner.resolve(strict=False)
         outer_resolved = outer.resolve(strict=False)
         assert inner_resolved in roots, f"expected inner in roots, got {roots}"
-        assert outer_resolved not in roots, (
-            f"deepest should win exclusively: roots={roots}, outer leaked"
-        )
+        assert (
+            outer_resolved not in roots
+        ), f"deepest should win exclusively: roots={roots}, outer leaked"
 
     def test_apply_with_empty_project_dirs_raises(self, palace_path):
         """Round-2 P1: `project_dirs=[]` (empty list) with apply must raise,
@@ -605,9 +605,9 @@ def boom(*args, **kwargs):
                 project_dirs=[synced_world["repo_path"]],
                 dry_run=False,
             )
-        assert any("Closet purge skipped" in record.getMessage() for record in caplog.records), (
-            f"expected closet-skip warning, got: {[r.getMessage() for r in caplog.records]}"
-        )
+        assert any(
+            "Closet purge skipped" in record.getMessage() for record in caplog.records
+        ), f"expected closet-skip warning, got: {[r.getMessage() for r in caplog.records]}"
 
     def test_metadata_cache_cleared_on_exception(self, monkeypatch, config, synced_world, kg):
         """F9 regression: tool_sync's try/finally must clear `_metadata_cache`
@@ -648,9 +648,9 @@ def explode(*args, **kwargs):
         assert result.get("success") is False
         assert "simulated" in result.get("error", "")
 
-        assert mcp_server._metadata_cache is None, (
-            "F9: cache must be cleared even when sync_palace raises"
-        )
+        assert (
+            mcp_server._metadata_cache is None
+        ), "F9: cache must be cleared even when sync_palace raises"
 
     def test_sync_report_keys_stable(self, synced_world):
         """Regression: SyncReport schema must not silently drop a field."""
@@ -943,9 +943,9 @@ def test_symlinked_project_root_resolves(self, tmp_dir, palace_path):
             wing="demo",
             dry_run=True,
         )
-        assert report["gitignored"] == 1, (
-            f"symmetric resolve broken: drawer mis-bucketed; report={report}"
-        )
+        assert (
+            report["gitignored"] == 1
+        ), f"symmetric resolve broken: drawer mis-bucketed; report={report}"
         assert report["out_of_scope"] == 0
 
     def test_classification_cache_avoids_redundant_disk_hits(
@@ -1002,9 +1002,9 @@ def counting_classify(*args, **kwargs):
         )
         assert report["scanned"] == 5
         assert report["gitignored"] == 5
-        assert call_count["n"] == 1, (
-            f"cache miss: expected 1 _classify_drawer call (4 cache hits), got {call_count['n']}"
-        )
+        assert (
+            call_count["n"] == 1
+        ), f"cache miss: expected 1 _classify_drawer call (4 cache hits), got {call_count['n']}"
 
     def test_closet_batch_purge_single_call(self, synced_world, monkeypatch):
         """Batched $in closet purge: one delete() call across all removable
@@ -1071,16 +1071,16 @@ def wrapped_get_closets(p, create=False):
             str(repo_path / "deleted.py"),
         }
         expected = len(seeded_sources & set(report["by_source"].keys()))
-        assert report["removed_closets"] == expected, (
-            f"removed_closets ({report['removed_closets']}) != |seeded ∩ removable| ({expected})"
-        )
+        assert (
+            report["removed_closets"] == expected
+        ), f"removed_closets ({report['removed_closets']}) != |seeded ∩ removable| ({expected})"
         assert "wrapper" in captured, "get_closets_collection patch not invoked"
-        assert captured["wrapper"].delete_calls == 1, (
-            f"expected one batch delete call, got {captured['wrapper'].delete_calls}"
-        )
-        assert captured["wrapper"].get_calls == 1, (
-            f"expected one batch get call, got {captured['wrapper'].get_calls}"
-        )
+        assert (
+            captured["wrapper"].delete_calls == 1
+        ), f"expected one batch delete call, got {captured['wrapper'].delete_calls}"
+        assert (
+            captured["wrapper"].get_calls == 1
+        ), f"expected one batch get call, got {captured['wrapper'].get_calls}"
 
     def test_registry_check_runs_before_cache_lookup(self, tmp_dir, palace_path):
         """A non-registry drawer with the same source_file must NOT poison
@@ -1145,9 +1145,9 @@ def test_registry_check_runs_before_cache_lookup(self, tmp_dir, palace_path):
         finally:
             del client
         assert "a_regular" not in survivors
-        assert "_reg_zzz_sentinel" in survivors, (
-            "registry sentinel was incorrectly pruned via cached non-registry verdict"
-        )
+        assert (
+            "_reg_zzz_sentinel" in survivors
+        ), "registry sentinel was incorrectly pruned via cached non-registry verdict"
 
     def test_normalize_project_dirs_sort_stable_on_equal_length(self):
         """`_normalize_project_dirs` must sort by `(-len, str)` so equal-length

From 1822756afee3980f0e341eee716972c6dc38824b Mon Sep 17 00:00:00 2001
From: mvalentsev <michael@valentsev.ru>
Date: Sat, 9 May 2026 04:12:18 +0500
Subject: [PATCH 116/127] fix(test_sync): use tmp_dir for elsewhere path so it
 stays absolute on Windows

Windows runs treated `/tmp/elsewhere/x.md` as relative because Windows
absolute paths require a drive letter, so `_classify_drawer` routed
`drawer_out_of_scope` to `no_source` instead of `out_of_scope` and
`test_dry_run_classifies_correctly` failed on test-windows.

`Path(tmp_dir) / "elsewhere" / "x.md"` is absolute on every platform
and still lives outside the project root that the synced_world fixture
exposes via `repo_path`, so the bucket assertions hold cross-platform.
---
 tests/test_sync.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/test_sync.py b/tests/test_sync.py
index 6ab2251..ea17040 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -12,7 +12,7 @@
 import pytest
 
 
-def _seed_drawers(palace_path, repo_path, deleted_path):
+def _seed_drawers(palace_path, repo_path, deleted_path, elsewhere_path):
     """Populate the drawers collection with 6 entries covering all buckets."""
     client = chromadb.PersistentClient(path=palace_path)
     col = client.get_or_create_collection("mempalace_drawers", metadata={"hnsw:space": "cosine"})
@@ -61,7 +61,7 @@ def _seed_drawers(palace_path, repo_path, deleted_path):
         {
             "wing": "demo",
             "room": "elsewhere",
-            "source_file": "/tmp/elsewhere/x.md",
+            "source_file": str(elsewhere_path),
             "chunk_index": 0,
             "added_by": "miner",
             "filed_at": "2026-05-09T00:00:00",
@@ -104,7 +104,10 @@ def synced_world(tmp_dir, palace_path):
     deleted.write_text("# was here\n")
     deleted.unlink()
 
-    _seed_drawers(palace_path, repo_path, deleted)
+    # Use tmp_dir for an absolute path; `/tmp/...` literals are not absolute on Windows.
+    elsewhere = Path(tmp_dir) / "elsewhere" / "x.md"
+
+    _seed_drawers(palace_path, repo_path, deleted, elsewhere)
     return {"palace_path": palace_path, "repo_path": str(repo_path)}
 
 
From f34b78c8b187dc2f90d6307b76178f36169abfca Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Sat, 9 May 2026 11:20:33 +0200
Subject: [PATCH 117/127] Update conftest.py as part of fix(tests): isolate
 palace lock subprocess state

---
 tests/conftest.py | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index eb5c525..9a3b02d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -36,26 +36,35 @@
 
 @pytest.fixture(autouse=True)
 def _reset_mcp_cache():
-    """Reset the MCP server's cached ChromaDB client/collection between tests."""
+    """Reset cached MCP state between tests without importing mcp_server.
+
+    If mempalace.mcp_server is already imported, close/clear its KG cache and
+    Chroma client cache. If it has not been imported, leave it unloaded so
+    fork/spawn-based tests do not inherit extra Chroma/SQLite state.
+    """
 
     def _clear_cache():
         try:
-            from mempalace import mcp_server
-
-            for kg in list(getattr(mcp_server, "_kg_by_path", {}).values()):
-                close = getattr(kg, "close", None)
-                if close is not None:
-                    try:
-                        close()
-                    except Exception:
-                        pass
-            if hasattr(mcp_server, "_kg_by_path"):
-                mcp_server._kg_by_path.clear()
-
-            mcp_server._client_cache = None
-            mcp_server._collection_cache = None
-        except (ImportError, AttributeError):
+            import sys
+
+            mcp_server = sys.modules.get("mempalace.mcp_server")
+            if mcp_server is not None:
+                for kg in list(getattr(mcp_server, "_kg_by_path", {}).values()):
+                    close = getattr(kg, "close", None)
+                    if close is not None:
+                        try:
+                            close()
+                        except Exception:
+                            pass
+
+                if hasattr(mcp_server, "_kg_by_path"):
+                    mcp_server._kg_by_path.clear()
+
+                mcp_server._client_cache = None
+                mcp_server._collection_cache = None
+        except AttributeError:
             pass
+
         try:
             # Reset the per-process quarantine gate so tests don't leak
             # state through ChromaBackend._quarantined_paths.

From 00041ca736f68375e271473dd60bd4574b4c6b74 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Sat, 9 May 2026 11:30:45 +0200
Subject: [PATCH 118/127] Update test_palace_locks.py as part of fix(tests):
 isolate palace lock subprocess state

---
 tests/test_palace_locks.py | 49 +++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 2e9f82f..78e577c 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -23,15 +23,15 @@
 
 
 def _get_mp_context():
-    """Pick a start method that works on every CI runner.
+    """Return a clean multiprocessing context for palace-lock tests.
 
-    `fork` is cheaper (no re-import) but is unavailable on Windows, so we fall
-    back to `spawn` there. `spawn` inherits ``os.environ`` (including the
-    monkeypatched ``HOME``) and re-imports the ``mempalace`` package in the
-    child, which is sufficient for the lock-file semantics exercised here.
+    Always use ``spawn`` so child processes do not inherit the parent's open
+    file descriptors, flock state, SQLite handles, or Chroma/MCP module state.
+    This is slower than ``fork`` but much safer for the full test suite on
+    Linux/macOS, and it matches the behavior Windows already used.
     """
-    start_method = "spawn" if os.name == "nt" else "fork"
-    return multiprocessing.get_context(start_method)
+
+    return multiprocessing.get_context("spawn")
 
 
 # ---------------------------------------------------------------------------
@@ -175,28 +175,39 @@ def test_palace_path_is_normalized(tmp_path, monkeypatch):
 def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
     """Same thread re-acquiring the same palace lock must not deadlock or raise.
 
-    This is the invariant that makes ``ChromaCollection`` write methods (which
-    take ``mine_palace_lock`` for MCP/direct-writer protection) compose with
-    ``miner.mine()`` (which already holds the lock for the entire mine
-    pipeline). Without the per-thread re-entrant guard the inner acquire
-    would self-deadlock on the outer flock.
+    This is the invariant that makes ``ChromaCollection`` write methods
+    (which take ``mine_palace_lock`` for MCP/direct-writer protection)
+    compose with ``miner.mine()`` (which already holds the lock for the
+    entire mine pipeline). Without the per-thread re-entrant guard the inner
+    acquire would self-deadlock on the outer flock.
     """
     monkeypatch.setenv("HOME", str(tmp_path))
     palace = str(tmp_path / "palace")
+
     with mine_palace_lock(palace):
         # Re-enter from the same thread — must yield without raising or hanging.
         with mine_palace_lock(palace):
             pass
-        # After the inner exits, the outer is still held: confirm via a
-        # subprocess that tries to acquire and reports back.
+
+        # After the inner exits, the outer is still held. Use spawn so the
+        # child does not inherit the parent's open lock fd or SQLite/Chroma
+        # process state from the full test suite.
         ctx = _get_mp_context()
         result_q = ctx.Queue()
         child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
-        child.start()
-        child.join(timeout=5)
-        assert (
-            result_q.get(timeout=1) == "busy"
-        ), "outer lock should still be held by parent after inner re-entrant exit"
+
+        try:
+            child.start()
+            assert result_q.get(timeout=10) == "busy", (
+                "outer lock should still be held by parent after inner re-entrant exit"
+            )
+
+            child.join(timeout=5)
+            assert child.exitcode == 0
+        finally:
+            if child.is_alive():
+                child.terminate()
+                child.join(timeout=5)
 
 
 def _try_acquire_expect_busy(palace_path, result_q):

From 710fa529c0e9431ca6a754fda220c6eb3a31a251 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Sat, 9 May 2026 11:45:51 +0200
Subject: [PATCH 119/127] style: format test_palace_locks as part of
 fix(tests): isolate palace lock subprocess state

---
 tests/test_palace_locks.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 78e577c..546a2d6 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -267,9 +267,9 @@ def test_lock_failure_message_names_holder(tmp_path, monkeypatch):
                 pytest.fail("second acquire of same palace should have raised")
 
         msg = str(excinfo.value)
-        assert (
-            f"PID {holder_pid}" in msg
-        ), f"lock-failure message must name the holder PID; got: {msg!r}"
+        assert f"PID {holder_pid}" in msg, (
+            f"lock-failure message must name the holder PID; got: {msg!r}"
+        )
     finally:
         open(release, "w").close()
         holder.join(timeout=5)

From 03f9a73d4447d6221196fde91cbcc1bd76660a04 Mon Sep 17 00:00:00 2001
From: fatkobra <55045047+fatkobra@users.noreply.github.com>
Date: Sat, 9 May 2026 12:16:26 +0200
Subject: [PATCH 120/127] fix(tests): format palace lock test

---
 tests/test_palace_locks.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 546a2d6..98b82d2 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -30,7 +30,6 @@ def _get_mp_context():
     This is slower than ``fork`` but much safer for the full test suite on
     Linux/macOS, and it matches the behavior Windows already used.
     """
-
     return multiprocessing.get_context("spawn")
 
 
@@ -183,25 +182,21 @@ def test_reentrant_same_thread_passes_through(tmp_path, monkeypatch):
     """
     monkeypatch.setenv("HOME", str(tmp_path))
     palace = str(tmp_path / "palace")
-
     with mine_palace_lock(palace):
         # Re-enter from the same thread — must yield without raising or hanging.
         with mine_palace_lock(palace):
             pass
-
         # After the inner exits, the outer is still held. Use spawn so the
         # child does not inherit the parent's open lock fd or SQLite/Chroma
         # process state from the full test suite.
         ctx = _get_mp_context()
         result_q = ctx.Queue()
         child = ctx.Process(target=_try_acquire_expect_busy, args=(palace, result_q))
-
         try:
             child.start()
             assert result_q.get(timeout=10) == "busy", (
                 "outer lock should still be held by parent after inner re-entrant exit"
             )
-
             child.join(timeout=5)
             assert child.exitcode == 0
         finally:

From b2ce45d30176c0923dbc255f9b71774c1fda8df5 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 9 May 2026 18:09:51 -0300
Subject: [PATCH 121/127] style: ruff format tests/test_mcp_server.py for ruff
 <0.5

---
 tests/test_mcp_server.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py
index 95b173e..b73e906 100644
--- a/tests/test_mcp_server.py
+++ b/tests/test_mcp_server.py
@@ -573,9 +573,9 @@ def test_add_drawer_shared_header_no_collision(self, monkeypatch, config, palace
 
         assert result1["success"] is True
         assert result2["success"] is True
-        assert result1["drawer_id"] != result2["drawer_id"], (
-            "Documents with shared header but different content must have distinct drawer IDs"
-        )
+        assert (
+            result1["drawer_id"] != result2["drawer_id"]
+        ), "Documents with shared header but different content must have distinct drawer IDs"
 
     def test_delete_drawer(self, monkeypatch, config, palace_path, seeded_collection, kg):
         _patch_mcp_server(monkeypatch, config, kg)
@@ -1313,9 +1313,9 @@ def _spy_create(self, name, **kwargs):
         all_calls = captured["get"] + captured["create"]
         assert all_calls, "expected get_collection or create_collection to be called"
         for kwargs in all_calls:
-            assert "embedding_function" in kwargs, (
-                f"missing embedding_function= in chromadb call: {kwargs}"
-            )
+            assert (
+                "embedding_function" in kwargs
+            ), f"missing embedding_function= in chromadb call: {kwargs}"
             assert kwargs["embedding_function"] is not None
 
         # Same expectation on the create=False (cache-miss) reopen path.

From 11d0a642026ee061cbb4d645a4ea242b0049372c Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 9 May 2026 20:11:08 -0300
Subject: [PATCH 122/127] fix(tests): use spawn instead of fork for lock-test
 subprocesses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_palace_locks.py and test_chroma_collection_lock.py spawned child
processes with the ``fork`` start method on POSIX. Under Python 3.13
this deadlocks reliably enough to hang the Linux 3.13 and macOS CI jobs
indefinitely while Linux 3.9 / 3.11 / Windows complete normally.

Root cause: by the time these tests run, the pytest parent process is
multi-threaded — chromadb and onnxruntime both spawn background threads
on import. ``fork`` snapshots the parent's address space into the
child without those threads, so any lock another thread held at fork
time stays locked in the child forever. Python 3.13 widened the window
where Python's own internal threads can be holding locks (hence the new
DeprecationWarning that fired ten times in our local 3.13 run).

macOS hits a related but distinct issue: Apple's CoreFoundation
explicitly forbids fork-without-exec; once anything in the parent has
loaded a CF-using library (ONNX, anything via Objective-C bridges) a
forked child will silently hang the moment it touches the same
library.

Switching to ``spawn`` re-imports modules in the child (~0.5s overhead
per Process — measurable but bounded), which is the standard fix for
both classes of bug. Lock-file semantics are unchanged: ``spawn``
inherits ``os.environ`` (including monkeypatched ``HOME``), which is
all these tests need from the parent.

Locally on Python 3.13: all 14 lock tests pass in 6.58s.
---
 tests/test_chroma_collection_lock.py | 10 +++++++---
 tests/test_palace_locks.py           | 18 ++++++++++--------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/tests/test_chroma_collection_lock.py b/tests/test_chroma_collection_lock.py
index 536b5e8..086bcbf 100644
--- a/tests/test_chroma_collection_lock.py
+++ b/tests/test_chroma_collection_lock.py
@@ -36,9 +36,13 @@
 
 
 def _get_mp_context():
-    """Same start-method picker as test_palace_locks.py."""
-    start_method = "spawn" if os.name == "nt" else "fork"
-    return multiprocessing.get_context(start_method)
+    """Same start-method picker as test_palace_locks.py — ``spawn`` everywhere.
+
+    ``fork`` deadlocks under Python 3.13 when the parent is multi-threaded
+    (pytest + chromadb + onnxruntime), and macOS forbids fork-without-exec via
+    CoreFoundation. ``spawn`` is slower (re-imports) but safe.
+    """
+    return multiprocessing.get_context("spawn")
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/test_palace_locks.py b/tests/test_palace_locks.py
index 2e9f82f..3da143b 100644
--- a/tests/test_palace_locks.py
+++ b/tests/test_palace_locks.py
@@ -23,15 +23,17 @@
 
 
 def _get_mp_context():
-    """Pick a start method that works on every CI runner.
-
-    `fork` is cheaper (no re-import) but is unavailable on Windows, so we fall
-    back to `spawn` there. `spawn` inherits ``os.environ`` (including the
-    monkeypatched ``HOME``) and re-imports the ``mempalace`` package in the
-    child, which is sufficient for the lock-file semantics exercised here.
+    """Always use ``spawn`` — ``fork`` deadlocks under modern Python.
+
+    The parent (pytest + chromadb + onnxruntime) is multi-threaded by the time
+    these tests run. ``fork`` snapshots that state into the child without the
+    threads that hold the locks, which Python 3.13 explicitly warns about and
+    which deadlocks the CI runners. macOS additionally forbids
+    fork-without-exec via CoreFoundation. ``spawn`` re-imports the package in
+    the child (slower, but safe) and inherits ``os.environ`` — including the
+    monkeypatched ``HOME`` — which is all these lock-file tests need.
     """
-    start_method = "spawn" if os.name == "nt" else "fork"
-    return multiprocessing.get_context(start_method)
+    return multiprocessing.get_context("spawn")
 
 
 # ---------------------------------------------------------------------------

From df5ca114b2ff47f713a76c097defdc1eb99b83fb Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 9 May 2026 19:37:49 -0300
Subject: [PATCH 123/127] chore(tests): wrap sqlite3 connections in
 contextlib.closing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several tests opened sqlite3 connections without try/finally or
context-manager cleanup, relying on a flat conn.close() after the
work. Any assertion failure or exception between connect and close
leaked the connection until GC, producing
``ResourceWarning: unclosed database`` in CI logs.

On Python 3.13 / macOS the ResourceWarning isn't just noise: a
leaked connection can hold a SQLite advisory lock long enough for
later test setup to block on it, which appears to be the cause of
recent intermittent CI hangs on those two runners.

Wrap each affected ``conn = sqlite3.connect(...)`` block in
``contextlib.closing(...)`` so cleanup runs on the failure path too.
Mirrors the try/finally pattern already used in production code
(searcher.py, repair.py, backends/chroma.py).

No behavior change — same operations, same assertions, just
deterministic cleanup. All 162 affected tests pass locally.
---
 tests/test_backends.py | 121 +++++++++++++++++++----------------------
 tests/test_repair.py   | 102 +++++++++++++++++-----------------
 tests/test_sources.py  |  81 +++++++++++++--------------
 3 files changed, 144 insertions(+), 160 deletions(-)

diff --git a/tests/test_backends.py b/tests/test_backends.py
index d10d08a..90cf128 100644
--- a/tests/test_backends.py
+++ b/tests/test_backends.py
@@ -2,6 +2,7 @@
 import pickle
 import shutil
 import sqlite3
+from contextlib import closing
 from pathlib import Path
 
 import chromadb
@@ -452,37 +453,33 @@ def test_get_collection_create_true_preserves_existing_metadata(tmp_path):
 def test_fix_blob_seq_ids_converts_blobs_to_integers(tmp_path):
     """Simulate a ChromaDB 0.6.x database with BLOB seq_ids and verify repair."""
     db_path = tmp_path / "chroma.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
-    # Insert BLOB seq_id like ChromaDB 0.6.x would
-    blob_42 = (42).to_bytes(8, byteorder="big")
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (blob_42,))
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
+        # Insert BLOB seq_id like ChromaDB 0.6.x would
+        blob_42 = (42).to_bytes(8, byteorder="big")
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (blob_42,))
+        conn.commit()
 
     _fix_blob_seq_ids(str(tmp_path))
 
-    conn = sqlite3.connect(str(db_path))
-    row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
-    assert row == (42, "integer")
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
+        assert row == (42, "integer")
 
 
 def test_fix_blob_seq_ids_noop_without_blobs(tmp_path):
     """No error when seq_ids are already integers."""
     db_path = tmp_path / "chroma.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id INTEGER)")
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (42)")
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id INTEGER)")
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (42)")
+        conn.commit()
 
     _fix_blob_seq_ids(str(tmp_path))
 
-    conn = sqlite3.connect(str(db_path))
-    row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
-    assert row == (42, "integer")
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
+        assert row == (42, "integer")
 
 
 def test_fix_blob_seq_ids_noop_without_database(tmp_path):
@@ -499,60 +496,56 @@ def test_fix_blob_seq_ids_does_not_touch_max_seq_id(tmp_path):
     silently suppressed every subsequent embeddings_queue write.
     """
     db_path = tmp_path / "chroma.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
-    conn.execute("CREATE TABLE max_seq_id (rowid INTEGER PRIMARY KEY, seq_id)")
-    sysdb10_blob = b"\x11\x11502607"
-    conn.execute("INSERT INTO max_seq_id (seq_id) VALUES (?)", (sysdb10_blob,))
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
+        conn.execute("CREATE TABLE max_seq_id (rowid INTEGER PRIMARY KEY, seq_id)")
+        sysdb10_blob = b"\x11\x11502607"
+        conn.execute("INSERT INTO max_seq_id (seq_id) VALUES (?)", (sysdb10_blob,))
+        conn.commit()
 
     _fix_blob_seq_ids(str(tmp_path))
 
-    conn = sqlite3.connect(str(db_path))
-    row = conn.execute("SELECT seq_id, typeof(seq_id) FROM max_seq_id").fetchone()
-    assert row == (sysdb10_blob, "blob")
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        row = conn.execute("SELECT seq_id, typeof(seq_id) FROM max_seq_id").fetchone()
+        assert row == (sysdb10_blob, "blob")
 
 
 def test_fix_blob_seq_ids_skips_sysdb10_prefix_in_embeddings(tmp_path):
     """Defense-in-depth: sysdb-10 prefix in embeddings.seq_id is skipped."""
     db_path = tmp_path / "chroma.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
-    sysdb10_blob = b"\x11\x11502607"
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (sysdb10_blob,))
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
+        sysdb10_blob = b"\x11\x11502607"
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (sysdb10_blob,))
+        conn.commit()
 
     _fix_blob_seq_ids(str(tmp_path))
 
-    conn = sqlite3.connect(str(db_path))
-    row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
-    # Still a BLOB — not converted to 1.23e18.
-    assert row == (sysdb10_blob, "blob")
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        row = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings").fetchone()
+        # Still a BLOB — not converted to 1.23e18.
+        assert row == (sysdb10_blob, "blob")
 
 
 def test_fix_blob_seq_ids_still_converts_legacy_blobs_in_embeddings(tmp_path):
     """Regression guard: pure big-endian u64 BLOBs still convert for genuine 0.6.x."""
     db_path = tmp_path / "chroma.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((42).to_bytes(8, "big"),))
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (b"\x11\x11502607",))
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((7).to_bytes(8, "big"),))
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((42).to_bytes(8, "big"),))
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", (b"\x11\x11502607",))
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((7).to_bytes(8, "big"),))
+        conn.commit()
 
     _fix_blob_seq_ids(str(tmp_path))
 
-    conn = sqlite3.connect(str(db_path))
-    rows = conn.execute("SELECT seq_id, typeof(seq_id) FROM embeddings ORDER BY rowid").fetchall()
-    assert rows[0] == (42, "integer")
-    assert rows[1] == (b"\x11\x11502607", "blob")  # sysdb-10 row left alone
-    assert rows[2] == (7, "integer")
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        rows = conn.execute(
+            "SELECT seq_id, typeof(seq_id) FROM embeddings ORDER BY rowid"
+        ).fetchall()
+        assert rows[0] == (42, "integer")
+        assert rows[1] == (b"\x11\x11502607", "blob")  # sysdb-10 row left alone
+        assert rows[2] == (7, "integer")
 
 
 def test_fix_blob_seq_ids_writes_marker_after_blob_path(tmp_path):
@@ -560,11 +553,10 @@ def test_fix_blob_seq_ids_writes_marker_after_blob_path(tmp_path):
     from mempalace.backends.chroma import _BLOB_FIX_MARKER
 
     db_path = tmp_path / "chroma.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((42).to_bytes(8, "big"),))
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id)")
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (?)", ((42).to_bytes(8, "big"),))
+        conn.commit()
 
     marker = tmp_path / _BLOB_FIX_MARKER
     assert not marker.exists()
@@ -585,11 +577,10 @@ def test_fix_blob_seq_ids_writes_marker_when_already_integer(tmp_path):
     from mempalace.backends.chroma import _BLOB_FIX_MARKER
 
     db_path = tmp_path / "chroma.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id INTEGER)")
-    conn.execute("INSERT INTO embeddings (seq_id) VALUES (42)")
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.execute("CREATE TABLE embeddings (rowid INTEGER PRIMARY KEY, seq_id INTEGER)")
+        conn.execute("INSERT INTO embeddings (seq_id) VALUES (42)")
+        conn.commit()
 
     marker = tmp_path / _BLOB_FIX_MARKER
     assert not marker.exists()
diff --git a/tests/test_repair.py b/tests/test_repair.py
index 37651ba..9507c5d 100644
--- a/tests/test_repair.py
+++ b/tests/test_repair.py
@@ -2,6 +2,7 @@
 
 import os
 import sqlite3
+from contextlib import closing
 from unittest.mock import MagicMock, call, patch
 
 import pytest
@@ -809,63 +810,62 @@ def _seed_poisoned_max_seq_id(
     closets_vec = "seg-closets-vec-0000-1111-2222-333344445555"
     closets_meta = "seg-closets-meta-0000-1111-2222-33334444555"
 
-    conn = sqlite3.connect(db_path)
-    conn.executescript(
-        """
-        CREATE TABLE segments(
-            id TEXT PRIMARY KEY, type TEXT, scope TEXT, collection TEXT
-        );
-        CREATE TABLE max_seq_id(segment_id TEXT PRIMARY KEY, seq_id);
-        CREATE TABLE embeddings(
-            id INTEGER PRIMARY KEY AUTOINCREMENT,
-            segment_id TEXT,
-            embedding_id TEXT,
-            seq_id
-        );
-        CREATE TABLE embeddings_queue(seq_id INTEGER PRIMARY KEY, topic TEXT, id TEXT);
-        CREATE TABLE collection_metadata(collection_id TEXT, key TEXT, str_value TEXT);
-        """
-    )
-    conn.executemany(
-        "INSERT INTO segments VALUES (?, ?, ?, ?)",
-        [
-            (drawers_vec, "urn:vector", "VECTOR", drawers_coll),
-            (drawers_meta, "urn:metadata", "METADATA", drawers_coll),
-            (closets_vec, "urn:vector", "VECTOR", closets_coll),
-            (closets_meta, "urn:metadata", "METADATA", closets_coll),
-        ],
-    )
-    conn.executemany(
-        "INSERT INTO max_seq_id(segment_id, seq_id) VALUES (?, ?)",
-        [
-            (drawers_vec, drawers_vec_poison),
-            (drawers_meta, drawers_meta_poison),
-            (closets_vec, closets_vec_poison),
-            (closets_meta, closets_meta_poison),
-        ],
-    )
-    # Populate embeddings so the collection-MAX heuristic has data to work with.
-    # drawers METADATA owns the max at drawers_meta_max; closets likewise.
-    for i in range(1, drawers_meta_max + 1, max(drawers_meta_max // 5, 1)):
+    with closing(sqlite3.connect(db_path)) as conn:
+        conn.executescript(
+            """
+            CREATE TABLE segments(
+                id TEXT PRIMARY KEY, type TEXT, scope TEXT, collection TEXT
+            );
+            CREATE TABLE max_seq_id(segment_id TEXT PRIMARY KEY, seq_id);
+            CREATE TABLE embeddings(
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                segment_id TEXT,
+                embedding_id TEXT,
+                seq_id
+            );
+            CREATE TABLE embeddings_queue(seq_id INTEGER PRIMARY KEY, topic TEXT, id TEXT);
+            CREATE TABLE collection_metadata(collection_id TEXT, key TEXT, str_value TEXT);
+            """
+        )
+        conn.executemany(
+            "INSERT INTO segments VALUES (?, ?, ?, ?)",
+            [
+                (drawers_vec, "urn:vector", "VECTOR", drawers_coll),
+                (drawers_meta, "urn:metadata", "METADATA", drawers_coll),
+                (closets_vec, "urn:vector", "VECTOR", closets_coll),
+                (closets_meta, "urn:metadata", "METADATA", closets_coll),
+            ],
+        )
+        conn.executemany(
+            "INSERT INTO max_seq_id(segment_id, seq_id) VALUES (?, ?)",
+            [
+                (drawers_vec, drawers_vec_poison),
+                (drawers_meta, drawers_meta_poison),
+                (closets_vec, closets_vec_poison),
+                (closets_meta, closets_meta_poison),
+            ],
+        )
+        # Populate embeddings so the collection-MAX heuristic has data to work with.
+        # drawers METADATA owns the max at drawers_meta_max; closets likewise.
+        for i in range(1, drawers_meta_max + 1, max(drawers_meta_max // 5, 1)):
+            conn.execute(
+                "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
+                (drawers_meta, f"d-{i}", i),
+            )
         conn.execute(
             "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
-            (drawers_meta, f"d-{i}", i),
+            (drawers_meta, "d-max", drawers_meta_max),
         )
-    conn.execute(
-        "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
-        (drawers_meta, "d-max", drawers_meta_max),
-    )
-    for i in range(1, closets_meta_max + 1, max(closets_meta_max // 5, 1)):
+        for i in range(1, closets_meta_max + 1, max(closets_meta_max // 5, 1)):
+            conn.execute(
+                "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
+                (closets_meta, f"c-{i}", i),
+            )
         conn.execute(
             "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
-            (closets_meta, f"c-{i}", i),
+            (closets_meta, "c-max", closets_meta_max),
         )
-    conn.execute(
-        "INSERT INTO embeddings(segment_id, embedding_id, seq_id) VALUES (?, ?, ?)",
-        (closets_meta, "c-max", closets_meta_max),
-    )
-    conn.commit()
-    conn.close()
+        conn.commit()
     return {
         "drawers_vec": drawers_vec,
         "drawers_meta": drawers_meta,
diff --git a/tests/test_sources.py b/tests/test_sources.py
index be24c32..cb01033 100644
--- a/tests/test_sources.py
+++ b/tests/test_sources.py
@@ -1,5 +1,8 @@
 """Tests for the RFC 002 source-adapter scaffolding."""
 
+import sqlite3
+from contextlib import closing
+
 import pytest
 
 from mempalace.sources import (
@@ -362,16 +365,13 @@ def test_knowledge_graph_add_triple_accepts_source_drawer_id_and_adapter_name(tm
         )
         assert triple_id is not None
 
-        import sqlite3
-
-        conn = sqlite3.connect(str(tmp_path / "kg.sqlite3"))
-        conn.row_factory = sqlite3.Row
-        row = conn.execute(
-            "SELECT source_drawer_id, adapter_name FROM triples WHERE id=?", (triple_id,)
-        ).fetchone()
-        assert row["source_drawer_id"] == "abc123_0"
-        assert row["adapter_name"] == "git"
-        conn.close()
+        with closing(sqlite3.connect(str(tmp_path / "kg.sqlite3"))) as conn:
+            conn.row_factory = sqlite3.Row
+            row = conn.execute(
+                "SELECT source_drawer_id, adapter_name FROM triples WHERE id=?", (triple_id,)
+            ).fetchone()
+            assert row["source_drawer_id"] == "abc123_0"
+            assert row["adapter_name"] == "git"
     finally:
         kg.close()
 
@@ -380,15 +380,12 @@ def test_knowledge_graph_fresh_schema_includes_new_columns(tmp_path):
     """Brand-new palaces should get source_drawer_id / adapter_name directly
     from CREATE TABLE, not via a post-hoc ALTER. _migrate_schema exists only
     for legacy palaces."""
-    import sqlite3
-
     from mempalace.knowledge_graph import KnowledgeGraph
 
     kg = KnowledgeGraph(db_path=str(tmp_path / "fresh.sqlite3"))
     try:
-        conn = sqlite3.connect(str(tmp_path / "fresh.sqlite3"))
-        cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")}
-        conn.close()
+        with closing(sqlite3.connect(str(tmp_path / "fresh.sqlite3"))) as conn:
+            cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")}
         assert "source_drawer_id" in cols
         assert "adapter_name" in cols
     finally:
@@ -397,42 +394,38 @@ def test_knowledge_graph_fresh_schema_includes_new_columns(tmp_path):
 
 def test_knowledge_graph_migration_adds_missing_columns_to_old_schema(tmp_path):
     """An old-schema triples table (pre-RFC 002) should auto-migrate on open."""
-    import sqlite3
-
     db_path = tmp_path / "legacy.sqlite3"
-    conn = sqlite3.connect(str(db_path))
-    conn.executescript("""
-        CREATE TABLE entities (
-            id TEXT PRIMARY KEY,
-            name TEXT NOT NULL,
-            type TEXT DEFAULT 'unknown',
-            properties TEXT DEFAULT '{}',
-            created_at TEXT DEFAULT CURRENT_TIMESTAMP
-        );
-        CREATE TABLE triples (
-            id TEXT PRIMARY KEY,
-            subject TEXT NOT NULL,
-            predicate TEXT NOT NULL,
-            object TEXT NOT NULL,
-            valid_from TEXT,
-            valid_to TEXT,
-            confidence REAL DEFAULT 1.0,
-            source_closet TEXT,
-            source_file TEXT,
-            extracted_at TEXT DEFAULT CURRENT_TIMESTAMP
-        );
-    """)
-    conn.commit()
-    conn.close()
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.executescript("""
+            CREATE TABLE entities (
+                id TEXT PRIMARY KEY,
+                name TEXT NOT NULL,
+                type TEXT DEFAULT 'unknown',
+                properties TEXT DEFAULT '{}',
+                created_at TEXT DEFAULT CURRENT_TIMESTAMP
+            );
+            CREATE TABLE triples (
+                id TEXT PRIMARY KEY,
+                subject TEXT NOT NULL,
+                predicate TEXT NOT NULL,
+                object TEXT NOT NULL,
+                valid_from TEXT,
+                valid_to TEXT,
+                confidence REAL DEFAULT 1.0,
+                source_closet TEXT,
+                source_file TEXT,
+                extracted_at TEXT DEFAULT CURRENT_TIMESTAMP
+            );
+        """)
+        conn.commit()
 
     from mempalace.knowledge_graph import KnowledgeGraph
 
     kg = KnowledgeGraph(db_path=str(db_path))
     try:
         # New columns must be present after _init_db runs the migration.
-        conn = sqlite3.connect(str(db_path))
-        cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")}
-        conn.close()
+        with closing(sqlite3.connect(str(db_path))) as conn:
+            cols = {row[1] for row in conn.execute("PRAGMA table_info(triples)")}
         assert "source_drawer_id" in cols
         assert "adapter_name" in cols
 

From fa9b7e0525a6afa96384942d4ce7083cac42fcd9 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 9 May 2026 21:11:13 -0300
Subject: [PATCH 124/127] chore(release): 3.3.5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps version 3.3.4 → 3.3.5 across pyproject.toml, version.py, plugin
manifests, README badge, and uv.lock. Flips CHANGELOG.md from
``[3.3.5] — unreleased`` to ``[3.3.5] — 2026-05-09`` and adds entries
for the four PRs that landed after the bug-fix block was authored:

- Bug Fixes: #1396 (tool_search retry on transient HNSW flush)
- Documentation: #1385 (CONTRIBUTING git-identity guidance, closes #1317)
- Internal: #1431 (test multiprocessing fork → spawn)
- Internal: #1430 (test sqlite connection lifecycle via contextlib.closing)

The four open issues remaining on the v3.3.5 milestone (#1266, #1253,
#1092, #1082) have been moved to v3.4 — they form the concurrent-writer
/ HNSW corruption cluster that needs deeper work than this cycle could
absorb.
---
 .claude-plugin/marketplace.json |  2 +-
 .claude-plugin/plugin.json      |  2 +-
 .codex-plugin/plugin.json       |  2 +-
 CHANGELOG.md                    | 12 +++++++++++-
 README.md                       |  2 +-
 mempalace/version.py            |  2 +-
 pyproject.toml                  |  2 +-
 uv.lock                         |  2 +-
 8 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index 9320057..8cf08aa 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -9,7 +9,7 @@
       "name": "mempalace",
       "source": "./.claude-plugin",
       "description": "AI memory system — mine projects and conversations into a searchable palace. 19 MCP tools, auto-save hooks, guided setup.",
-      "version": "3.3.4",
+      "version": "3.3.5",
       "author": {
         "name": "milla-jovovich"
       }
diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index 3794c9d..cedc569 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "mempalace",
-  "version": "3.3.4",
+  "version": "3.3.5",
   "description": "Give your AI a memory — mine projects and conversations into a searchable palace. 19 MCP tools, auto-save hooks, and guided setup.",
   "author": {
     "name": "milla-jovovich"
diff --git a/.codex-plugin/plugin.json b/.codex-plugin/plugin.json
index 02d0902..65083d4 100644
--- a/.codex-plugin/plugin.json
+++ b/.codex-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
   "name": "mempalace",
-  "version": "3.3.4",
+  "version": "3.3.5",
   "description": "Give your AI a memory — mine projects and conversations into a searchable palace. 19 MCP tools, auto-save hooks, and guided setup.",
   "author": {
     "name": "milla-jovovich"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index be5c14e..78d2ade 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,10 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 ---
 
-## [3.3.5] — unreleased
+## [3.3.5] — 2026-05-09
 
 ### Bug Fixes
 
+- **MCP `tool_search` now retries once on transient `Error finding id` from chromadb's HNSW flush window.** After a bulk CLI mine, ChromaDB's HNSW segment metadata can be unflushed for ~30-60s; wing-scoped MCP search hits `Internal error: Error finding id` during that window. `tool_search` now detects this transient via response-shape sniffing, drops both the MCP-local client cache and `_DEFAULT_BACKEND._clients` / `_freshness` for the palace, sleeps 2s, and retries once. Successful retries are tagged with `index_recovered: true` so callers can observe when it fired; non-transient errors bypass the retry path entirely. Partial fix for the broader #1315 cluster — `tool_check_duplicate` and other index-touching tools still need the same wrapper. (#1396, refs #1082, #1315)
 - **`mempalace_diary_read` silently dropped entries on agent-name case mismatch.** `tool_diary_write` stored the `agent` metadata verbatim after `sanitize_name`, which preserves case, while `tool_diary_read` filtered by exact match. Writing as `"Claude"` and reading as `"claude"` (or vice-versa) returned zero rows. Both endpoints now lowercase `agent_name` immediately after sanitization, so reads are case-insensitive and the default per-agent wing slug is stable across casings. **Behavior change:** entries written prior to this fix under mixed-case agent names will not match the new lowercase filter; run `mempalace repair` if you need to migrate legacy diary metadata. (#1243)
 - **Knowledge-graph triples with `valid_to < valid_from` were silently invisible.** `KnowledgeGraph.query_entity()` filters with `valid_from <= as_of AND valid_to >= as_of`, so an inverted interval matches no `as_of` and the row is durably stored but unreachable — a P0 data-integrity foot-gun any caller that mixes up the two date params can hit. `add_triple()` now rejects inverted intervals at write time with a clear `ValueError` naming both bounds. Open intervals (one bound only) and point-in-time facts (`valid_from == valid_to`) remain accepted unchanged. (#1214)
 - **`ChromaBackend.close_palace()` / `close()` did not release the SQLite file lock.** Evicted clients sat in `_clients` without `close()`, and chromadb 1.5.x retains the rust-side SQLite lock until GC. Reopening the same palace path after `shutil.rmtree` + recreate within one process failed with `SQLITE_READONLY_DBMOVED` (code 1032). New `_close_client()` helper now calls `PersistentClient.close()` (with a try/except fallback for older chromadb) on `close_palace()`, on whole-backend `close()`, and on the `_client()` invalidation path that detects a missing `chroma.sqlite3`. The mtime/inode auto-invalidation branch is intentionally left alone — callers there may still hold a live `ChromaCollection`. (#1067, #1105)
@@ -21,6 +22,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 - **MCP server's `_kg` was a module-level singleton.** Multi-tenant hosts that rotate `MEMPALACE_PALACE_PATH` between tool calls hit the wrong sqlite file, because the KG was constructed once at import time while the ChromaDB side was already per-call via `_get_client()`. The KG is now resolved per-call through a lazy per-path cache (`_kg_by_path` keyed by `os.path.abspath`, with a double-checked-locking init under `_kg_cache_lock`). `tool_reconnect` drains and `close()`s cached KGs alongside the existing chroma reconnect. A `_call_kg` retry guard catches `sqlite3.ProgrammingError` once after a reconnect race. (#1136, #1160)
 - **`mempalace repair` can now recover palaces whose HNSW segment writer is stuck on `apply_logs`.** Both the existing `--mode legacy` rebuild and the inline `cli.cmd_repair` path call `Collection.count()` as their first read — exactly the call that raises `chromadb.errors.InternalError: Failed to apply logs to the hnsw segment writer` on the corruption class introduced upstream and reported in #1308. Repair would print `Cannot recover — palace may need to be re-mined from source files` even though the underlying SQLite tables were fully intact (the corruption lives in the on-disk index files, not the data layer). New `--mode from-sqlite` reads `(id, document, metadata)` rows directly from `chroma.sqlite3` via a `segments` → `embeddings` → `embedding_metadata` join, never opens a chromadb client against the corrupt palace, and re-upserts everything into a fresh palace at `--palace`. `--source PATH` extracts from a corrupt palace already moved aside; `--archive-existing` handles the in-place case by renaming the existing palace to `<palace>.pre-rebuild-<timestamp>` before reading from it. Documents are re-embedded under the user's configured embedding function (the original HNSW vectors live in the corrupt `data_level0.bin` and cannot be recovered, but the embedding model is deterministic so search results remain semantically equivalent). Verified end-to-end on a 52,300-row real-world corrupt palace. (#1308)
 
+### Documentation
+
+- **`CONTRIBUTING.md` git-identity guidance.** New section asks contributors to verify `git config user.name` and `git config user.email` before pushing, with an explicit warning for agentic coding tools that may not inherit the user's normal Git config. Avoids placeholder/template author values in commit history. (#1385, closes #1317)
+
+### Internal
+
+- **Test reliability: `multiprocessing` start method.** `tests/test_palace_locks.py` and `tests/test_chroma_collection_lock.py` switched from `fork` to `spawn` for child processes. Under Python 3.13 the pytest parent is multi-threaded by the time these tests run (chromadb + onnxruntime each spawn background threads on import); `fork` snapshotting that state into the child without the threads themselves deadlocked Linux 3.13 and macOS CI jobs indefinitely while Linux 3.9 / 3.11 / Windows finished normally. macOS additionally forbids fork-without-exec via CoreFoundation. `spawn` re-imports modules in the child (~0.5s per Process — bounded by the 10 subprocesses these tests fork) but is safe under threading. (#1431)
+- **Test cleanup: SQLite connection lifecycle.** Wrapped naked `conn = sqlite3.connect(...)` blocks in `tests/test_backends.py`, `tests/test_sources.py`, and `tests/test_repair.py` with `contextlib.closing(...)`. The flat `conn.close()` pattern at the end of each test leaked the connection on any exception or assertion failure between connect and close, producing `ResourceWarning: unclosed database` noise in CI logs and creating a secondary risk of advisory-lock starvation on Python 3.13 / macOS. Mirrors the `try/finally` pattern already used in production code. (#1430)
+
 ---
 
 ## [3.3.4] — unreleased
diff --git a/README.md b/README.md
index 28207f3..bc67637 100644
--- a/README.md
+++ b/README.md
@@ -190,7 +190,7 @@ PRs welcome. See [CONTRIBUTING.md](CONTRIBUTING.md).
 MIT — see [LICENSE](LICENSE).
 
 <!-- Link Definitions -->
-[version-shield]: https://img.shields.io/badge/version-3.3.4-4dc9f6?style=flat-square&labelColor=0a0e14
+[version-shield]: https://img.shields.io/badge/version-3.3.5-4dc9f6?style=flat-square&labelColor=0a0e14
 [release-link]: https://github.com/MemPalace/mempalace/releases
 [python-shield]: https://img.shields.io/badge/python-3.9+-7dd8f8?style=flat-square&labelColor=0a0e14&logo=python&logoColor=7dd8f8
 [python-link]: https://www.python.org/
diff --git a/mempalace/version.py b/mempalace/version.py
index 1db1b9d..a17619d 100644
--- a/mempalace/version.py
+++ b/mempalace/version.py
@@ -1,3 +1,3 @@
 """Single source of truth for the MemPalace package version."""
 
-__version__ = "3.3.4"
+__version__ = "3.3.5"
diff --git a/pyproject.toml b/pyproject.toml
index ae2ea27..580f777 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mempalace"
-version = "3.3.4"
+version = "3.3.5"
 description = "Give your AI a memory — mine projects and conversations into a searchable palace. No API key required."
 readme = "README.md"
 requires-python = ">=3.9"
diff --git a/uv.lock b/uv.lock
index 04f9303..2c96d67 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1169,7 +1169,7 @@ wheels = [
 
 [[package]]
 name = "mempalace"
-version = "3.3.4"
+version = "3.3.5"
 source = { editable = "." }
 dependencies = [
     { name = "chromadb" },

From 6e9d057a427ada86258376475384292d0c2d1c64 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sat, 9 May 2026 21:30:31 -0300
Subject: [PATCH 125/127] docs(changelog): backfill 3.3.4 release date

The 3.3.4 release shipped 2026-05-01 (per GitHub release v3.3.4) but
its CHANGELOG header was never flipped from ``unreleased`` to the
release date. Backfill while we're already touching CHANGELOG for
the 3.3.5 cut.
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 78d2ade..f2d8163 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,7 +33,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 
 ---
 
-## [3.3.4] — unreleased
+## [3.3.4] — 2026-05-01
 
 ### Added
 

From a5ec32561c9bf32e42512da5307efd5033ab6eb2 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Sun, 10 May 2026 01:35:21 -0300
Subject: [PATCH 126/127] docs(changelog): correct KG date validator entry for
 3.3.5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Copilot review on PR #1434 caught that the existing 3.3.5 entry
described the validator as it was authored under #1167 — accepting
``YYYY``/``YYYY-MM``/``YYYY-MM-DD`` and rejecting ISO datetimes — but
PR #1417 (closes #1374) merged into develop on 2026-05-10 and
inverted that: ``sanitize_iso_temporal()`` now rejects partial dates
and accepts canonical UTC datetimes (``YYYY-MM-DDTHH:MM:SSZ`` /
``+00:00``). ``sanitize_iso_date()`` is kept as a backwards-compat
wrapper.

Update the bullet to describe the *shipped* behavior, name both
functions, list both accepted and rejected forms, and call out the
3.3.4 → 3.3.5 behavior change for partial-date inputs that now error.
Reference both #1167 (original) and #1374/#1417 (the expansion).
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d52cc39..4a4b298 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,7 +18,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 - **`miner.detect_room` bidirectional substring matching caused systemic misrouting.** The priority-1 (path parts) and priority-2 (filename) checks used `c in part or part in c` against room names + keywords, so any token that was an unbounded substring of a room name (or vice versa) matched. Priority-1 iterates left-to-right and returns on first match, so `views/billing-page/src/Foo.test.tsx` routed to an `interviews` room because `"views" in "interviews"` matched before reaching `billing-page`. Both call sites now use a `_name_matches` helper that compares names as equal or as separator-bounded tokens of each other (split on `-`, `_`, `.`, `/`). (#1004, closes #1002)
 - **`mempalace compress` crashed on large palaces.** `regenerate_closets` fetched all closet_llm drawers in a single `col.get()`, which trips `SQLITE_MAX_VARIABLE_NUMBER` on palaces above ~32k drawers. Mirrors the #851 fix in `miner.py`: drawer fetch is now paginated at `batch_size=5000`. Per-source aggregation works across batches, so the LLM regeneration call still groups chunks correctly. (#1073, #1107)
 - **CLI and `fact_checker --stdin` mojibaked non-ASCII content on Windows.** Python defaults `sys.stdin`/`stdout`/`stderr` to the system ANSI codepage (cp1252/cp1251/cp950), so `mempalace search > out.txt` and piped fact_checker invocations corrupted Cyrillic / CJK drawer text at the process boundary. New `mempalace/_stdio.py` helper reconfigures all three streams to UTF-8 on `sys.platform == "win32"`, with per-stream `errors` policy: `surrogateescape` on stdin (preserves bad bytes from redirected files for the consumer's parser), `replace` on stdout/stderr (substitutes U+FFFD instead of `UnicodeEncodeError`-ing mid-print). With this, all three user-facing console_scripts (`mcp_server`, `hooks_cli`, `cli`/`fact_checker`) now reconfigure identically on Windows. (#1282)
-- **MCP knowledge-graph tools forwarded malformed date strings to SQLite.** `tool_kg_query` (`as_of`), `tool_kg_add` (`valid_from`), and `tool_kg_invalidate` (`ended`) accepted any string and produced empty result sets on natural-language inputs like `"March 2026"` or `"yesterday"` — callers (especially LLM agents) could not distinguish "no fact at this time" from "your date format was unrecognized." New `sanitize_iso_date()` validator in `config.py` accepts `YYYY`, `YYYY-MM`, `YYYY-MM-DD` (and passes through `None`/`""`); all three tools call it before values reach the storage layer. **Behavior change:** previously-silent date typos now raise a clear `ValueError` naming the offending field; full ISO-8601 with time (`YYYY-MM-DDTHH:MM:SS`, timezone offsets) is not yet accepted — file an issue if you have a use case. (#1164, #1167)
+- **MCP knowledge-graph tools forwarded malformed date strings to SQLite.** `tool_kg_query` (`as_of`), `tool_kg_add` (`valid_from`), and `tool_kg_invalidate` (`ended`) accepted any string and produced empty result sets on natural-language inputs like `"March 2026"` or `"yesterday"` — callers (especially LLM agents) could not distinguish "no fact at this time" from "your date format was unrecognized." New `sanitize_iso_temporal()` validator in `config.py` (with `sanitize_iso_date()` retained as a backwards-compat wrapper) accepts `YYYY-MM-DD`, `YYYY-MM-DDTHH:MM:SSZ`, and `YYYY-MM-DDTHH:MM:SS+00:00` (normalized to the `Z` form), and passes `None`/`""` through unchanged; all three KG tools call it before values reach the storage layer. Partial dates (`YYYY`, `YYYY-MM`), naive datetimes, and non-UTC timezone offsets are rejected because KG queries compare TEXT temporal values where mixed formats silently return wrong results. **Behavior change:** previously-silent date typos now raise a clear `ValueError` naming the offending field; partial-date inputs that worked in 3.3.4 (`"2026"`, `"2026-05"`) no longer parse — pass a full `YYYY-MM-DD` or a canonical UTC datetime instead. (#1164, #1167, #1374, #1417)
 - **MCP server's `_kg` was a module-level singleton.** Multi-tenant hosts that rotate `MEMPALACE_PALACE_PATH` between tool calls hit the wrong sqlite file, because the KG was constructed once at import time while the ChromaDB side was already per-call via `_get_client()`. The KG is now resolved per-call through a lazy per-path cache (`_kg_by_path` keyed by `os.path.abspath`, with a double-checked-locking init under `_kg_cache_lock`). `tool_reconnect` drains and `close()`s cached KGs alongside the existing chroma reconnect. A `_call_kg` retry guard catches `sqlite3.ProgrammingError` once after a reconnect race. (#1136, #1160)
 - **`mempalace repair` can now recover palaces whose HNSW segment writer is stuck on `apply_logs`.** Both the existing `--mode legacy` rebuild and the inline `cli.cmd_repair` path call `Collection.count()` as their first read — exactly the call that raises `chromadb.errors.InternalError: Failed to apply logs to the hnsw segment writer` on the corruption class introduced upstream and reported in #1308. Repair would print `Cannot recover — palace may need to be re-mined from source files` even though the underlying SQLite tables were fully intact (the corruption lives in the on-disk index files, not the data layer). New `--mode from-sqlite` reads `(id, document, metadata)` rows directly from `chroma.sqlite3` via a `segments` → `embeddings` → `embedding_metadata` join, never opens a chromadb client against the corrupt palace, and re-upserts everything into a fresh palace at `--palace`. `--source PATH` extracts from a corrupt palace already moved aside; `--archive-existing` handles the in-place case by renaming the existing palace to `<palace>.pre-rebuild-<timestamp>` before reading from it. Documents are re-embedded under the user's configured embedding function (the original HNSW vectors live in the corrupt `data_level0.bin` and cannot be recovered, but the embedding model is deterministic so search results remain semantically equivalent). Verified end-to-end on a 52,300-row real-world corrupt palace. (#1308)
 

From edbec594df08244cc33f3a48660b98126dab1704 Mon Sep 17 00:00:00 2001
From: Kostadis Rousoss <kostadis@gmail.com>
Date: Sun, 17 May 2026 22:47:25 -0700
Subject: [PATCH 127/127] fix(sync): scope .mempalaceignore matchers per-wing +
 honor named palaces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two sync bugs found during a Phandalin palace rebuild.

1. (critical, data-loss) `mempalace sync` was applying every ancestor
   `.mempalaceignore` from the user-passed project root down to the
   drawer's parent — including a root-level ignore file whose only
   purpose is to tell the ROOT wing's mine to skip each sub-wing's
   source dir. Result: every sub-wing drawer matched the ignore pattern
   and got flagged for deletion; `--apply` would wipe the wing.

   Fix: detect each drawer's per-wing source root by walking parents
   to the nearest `mempalace.yaml`; matcher loading starts there, so
   ignore files above the wing's own config marker are out of scope.
   Per-directory cache amortizes the walk across the wing.

2. (annoyance) `cmd_sync` was reading `--palace` as a raw filesystem
   path instead of routing through `_resolve_cli_palace`, so named-
   palace aliases worked for every other subcommand except `sync`.

Tests:
- root .mempalaceignore listing sub-wing dir leaves sub-wing drawers Kept
- per-wing .mempalaceignore inside the wing's root still takes effect
- root wing (whose mempalace.yaml lives at the project root) still sees
  the root ignore file
- CLI `--palace <alias>` resolves through the named-palace map in sync

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 mempalace/cli.py   |   2 +-
 mempalace/sync.py  |  65 ++++++++++-
 tests/test_sync.py | 265 ++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 298 insertions(+), 34 deletions(-)

diff --git a/mempalace/cli.py b/mempalace/cli.py
index 54e8b48..b4c72dd 100644
--- a/mempalace/cli.py
+++ b/mempalace/cli.py
@@ -613,7 +613,7 @@ def cmd_sync(args):
     from .palace import MineAlreadyRunning
     from .sync import sync_palace
 
-    palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
+    palace_path = _resolve_cli_palace(args)
 
     if not os.path.isdir(palace_path):
         print(f"\n  No palace found at {palace_path}")
diff --git a/mempalace/sync.py b/mempalace/sync.py
index 0505b07..5a02b66 100644
--- a/mempalace/sync.py
+++ b/mempalace/sync.py
@@ -42,6 +42,9 @@ class SyncReport(TypedDict):
     by_source: dict[str, int]
 
 
+_WING_CONFIG_NAMES = ("mempalace.yaml", "mempalace.yml", "mempal.yaml", "mempal.yml")
+
+
 def _resolve_project_root(source_file: Path, project_roots: list) -> Optional[Path]:
     """Return the longest project_root that source_file lives under.
 
@@ -57,6 +60,41 @@ def _resolve_project_root(source_file: Path, project_roots: list) -> Optional[Pa
     return None
 
 
+def _find_wing_source_root(source_file: Path, project_root: Path, wing_root_cache: dict) -> Path:
+    """Return the nearest ``mempalace.yaml``-rooted ancestor of ``source_file``
+    (within ``project_root``). Falls back to ``project_root`` when no per-wing
+    config marker is found.
+
+    The miner walks each wing's tree from its own ``mempalace.yaml`` dir, so
+    that directory — not a higher ancestor — is the matcher scope for the
+    wing's drawers. A root-level ``.mempalaceignore`` typically lists each
+    sub-wing's source dir so the ROOT wing's mine skips them; those patterns
+    are not in scope for sub-wing drawers, which were deliberately mined
+    from inside the listed dirs. Without this check, sync flags every sub-
+    wing drawer as gitignored and would delete them on ``--apply``.
+
+    Walks are cached per directory: once we decide a wing root for ``a/b/c``,
+    every chunk of every file under that subtree reuses the answer.
+    """
+    visited: list = []
+    candidate = source_file.parent
+    while True:
+        if candidate in wing_root_cache:
+            answer = wing_root_cache[candidate]
+            break
+        visited.append(candidate)
+        if any((candidate / name).is_file() for name in _WING_CONFIG_NAMES):
+            answer = candidate
+            break
+        if candidate == project_root or candidate.parent == candidate:
+            answer = project_root
+            break
+        candidate = candidate.parent
+    for d in visited:
+        wing_root_cache[d] = answer
+    return answer
+
+
 def _ancestor_matchers(source_file: Path, root: Path, matcher_cache: dict) -> list:
     """Build the ancestor-chain matcher list, root → file's parent.
 
@@ -97,11 +135,24 @@ def _is_registry_row(meta: dict, drawer_id: str) -> bool:
 
 
 def _classify_drawer(
-    meta: dict, matcher_cache: dict, project_roots: list, drawer_id: str = ""
+    meta: dict,
+    matcher_cache: dict,
+    project_roots: list,
+    drawer_id: str = "",
+    wing_root_cache: Optional[dict] = None,
 ) -> str:
     """Classify a drawer by its source_file metadata.
 
     Returns one of: kept, gitignored, missing, no_source, out_of_scope.
+
+    ``wing_root_cache`` (optional, recommended for production callers)
+    narrows the matcher scope to the drawer's per-wing source root — the
+    nearest ``mempalace.yaml`` ancestor — so a root-level ``.mempalaceignore``
+    that excludes sub-wing source dirs from the ROOT wing's mine does not
+    flag legitimate sub-wing drawers as gitignored. Without it the function
+    falls back to the user-supplied project_root, which is the historical
+    (buggy-for-multi-wing-palaces) behaviour and only safe for single-wing
+    palaces.
     """
     # Defensive: main loop filters registry rows; this guards direct callers.
     if _is_registry_row(meta, drawer_id):
@@ -123,7 +174,11 @@ def _classify_drawer(
     if not src.exists():
         return "missing"
 
-    matchers = _ancestor_matchers(src, root, matcher_cache)
+    if wing_root_cache is not None:
+        matcher_root = _find_wing_source_root(src, root, wing_root_cache)
+    else:
+        matcher_root = root
+    matchers = _ancestor_matchers(src, matcher_root, matcher_cache)
     if matchers and is_gitignored(src, matchers, is_dir=False):
         return "gitignored"
 
@@ -254,6 +309,10 @@ def sync_palace(
             roots = _auto_detect_project_roots(col, wing)
 
         matcher_cache: dict = {}
+        # Wing-root lookups (nearest mempalace.yaml ancestor) are cached
+        # per-directory so every drawer under one wing reuses the answer
+        # instead of re-walking the tree.
+        wing_root_cache: dict = {}
         # Same source_file → same verdict holds because mine_palace_lock
         # blocks concurrent writers and the loop is synchronous.
         classification_cache: dict = {}
@@ -268,7 +327,7 @@ def sync_palace(
             elif source_file and source_file in classification_cache:
                 bucket = classification_cache[source_file]
             else:
-                bucket = _classify_drawer(meta, matcher_cache, roots, drawer_id)
+                bucket = _classify_drawer(meta, matcher_cache, roots, drawer_id, wing_root_cache)
                 if source_file:
                     classification_cache[source_file] = bucket
 
diff --git a/tests/test_sync.py b/tests/test_sync.py
index ea17040..262a131 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -365,6 +365,178 @@ def test_nested_gitignore_layers(self, tmp_dir, palace_path):
         finally:
             del client
 
+    def test_root_mempalaceignore_does_not_flag_sub_wing_drawers(self, tmp_dir, palace_path):
+        """Bug: a root-level ``.mempalaceignore`` listing each sub-wing's
+        source dir (so the root mine skips them) must NOT flag the sub-
+        wing's drawers as gitignored. Sub-wings are mined from inside
+        those dirs and carry their own ``mempalace.yaml``; the root's
+        ignore patterns are out of scope for them.
+
+        Without the fix, ``sync --wing <sub>`` reports every sub-wing
+        drawer as "gitignored" and ``--apply`` wipes the wing.
+        """
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        chapters = repo_path / "docs" / "chapters"
+        chapters.mkdir(parents=True)
+        # Root .mempalaceignore excludes the narrative sub-wing's source dir
+        # from the ROOT wing's mine — mirrors the Phandalin bug report.
+        (repo_path / ".mempalaceignore").write_text("docs/chapters/\n")
+        # Each sub-wing has its own mempalace.yaml — the per-wing root marker.
+        (chapters / "mempalace.yaml").write_text("wing: narrative\nrooms: []\n")
+        (chapters / "chapter_01.md").write_text("# chapter 1\n")
+        (chapters / "chapter_02.md").write_text("# chapter 2\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_ch01", "d_ch02"],
+            documents=["c1", "c2"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "narrative",
+                    "room": "chapters",
+                    "source_file": str(chapters / "chapter_01.md"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+                {
+                    "wing": "narrative",
+                    "room": "chapters",
+                    "source_file": str(chapters / "chapter_02.md"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+            ],
+        )
+        del client
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="narrative",
+            dry_run=True,
+        )
+        assert report["scanned"] == 2
+        assert report["kept"] == 2
+        assert report["gitignored"] == 0
+        assert report["missing"] == 0
+
+    def test_sub_wing_local_mempalaceignore_still_honored(self, tmp_dir, palace_path):
+        """Per-wing ignore patterns inside the wing's own source root must
+        still take effect — only ancestors above the wing's mempalace.yaml
+        are out of scope.
+        """
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        chapters = repo_path / "docs" / "chapters"
+        (chapters / "build").mkdir(parents=True)
+        (repo_path / ".mempalaceignore").write_text("docs/chapters/\n")
+        (chapters / "mempalace.yaml").write_text("wing: narrative\nrooms: []\n")
+        # Sub-wing's OWN .mempalaceignore — should still apply.
+        (chapters / ".mempalaceignore").write_text("build/\n")
+        (chapters / "keep.md").write_text("# keep\n")
+        (chapters / "build" / "drop.md").write_text("# drop\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_keep", "d_drop"],
+            documents=["k", "d"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "narrative",
+                    "room": "chapters",
+                    "source_file": str(chapters / "keep.md"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+                {
+                    "wing": "narrative",
+                    "room": "chapters",
+                    "source_file": str(chapters / "build" / "drop.md"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+            ],
+        )
+        del client
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="narrative",
+            dry_run=True,
+        )
+        assert report["scanned"] == 2
+        assert report["kept"] == 1
+        assert report["gitignored"] == 1
+
+    def test_root_wing_with_own_yaml_still_uses_root_ignore(self, tmp_dir, palace_path):
+        """The root wing's drawers should still see the root ``.mempalaceignore``:
+        its wing source root IS the project root because its ``mempalace.yaml``
+        lives there. Anything matching the root's ignore patterns is correctly
+        flagged.
+        """
+        from mempalace.sync import sync_palace
+
+        repo_path = Path(tmp_dir) / "repo"
+        (repo_path / "junk").mkdir(parents=True)
+        (repo_path / "mempalace.yaml").write_text("wing: root\nrooms: []\n")
+        (repo_path / ".mempalaceignore").write_text("junk/\n")
+        (repo_path / "keep.md").write_text("# keep\n")
+        (repo_path / "junk" / "drop.md").write_text("# drop\n")
+
+        client = chromadb.PersistentClient(path=palace_path)
+        col = client.get_or_create_collection(
+            "mempalace_drawers", metadata={"hnsw:space": "cosine"}
+        )
+        col.add(
+            ids=["d_keep", "d_drop"],
+            documents=["k", "d"],
+            embeddings=[[1.0, 0.0, 0.0], [2.0, 0.0, 0.0]],
+            metadatas=[
+                {
+                    "wing": "root",
+                    "room": "general",
+                    "source_file": str(repo_path / "keep.md"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+                {
+                    "wing": "root",
+                    "room": "general",
+                    "source_file": str(repo_path / "junk" / "drop.md"),
+                    "chunk_index": 0,
+                    "added_by": "miner",
+                    "filed_at": "2026-05-09T00:00:00",
+                },
+            ],
+        )
+        del client
+
+        report = sync_palace(
+            palace_path=palace_path,
+            project_dirs=[str(repo_path)],
+            wing="root",
+            dry_run=True,
+        )
+        assert report["kept"] == 1
+        assert report["gitignored"] == 1
+
     def test_closet_purge_runs_on_apply(self, synced_world):
         """Closets pointing at removed sources must also disappear."""
         from mempalace.sync import sync_palace
@@ -444,9 +616,9 @@ def fake_wal(operation, params, result=None):
         # Allow-list — params must be exactly the documented audit shape so
         # any future leak (source_file, content, ID lists, etc.) trips a
         # test failure rather than slipping through a deny-list.
-        assert set(params.keys()) <= {
-            "first_id"
-        }, f"WAL params drifted from the audit allow-list: {params.keys()}"
+        assert set(params.keys()) <= {"first_id"}, (
+            f"WAL params drifted from the audit allow-list: {params.keys()}"
+        )
 
     def test_registry_sentinels_preserved_on_apply(self, tmp_dir, palace_path):
         """F2 regression: convo miner `_reg_*` sentinels must survive sync apply.
@@ -567,9 +739,9 @@ def test_auto_detect_picks_deepest_root(self, tmp_dir, palace_path):
         inner_resolved = inner.resolve(strict=False)
         outer_resolved = outer.resolve(strict=False)
         assert inner_resolved in roots, f"expected inner in roots, got {roots}"
-        assert (
-            outer_resolved not in roots
-        ), f"deepest should win exclusively: roots={roots}, outer leaked"
+        assert outer_resolved not in roots, (
+            f"deepest should win exclusively: roots={roots}, outer leaked"
+        )
 
     def test_apply_with_empty_project_dirs_raises(self, palace_path):
         """Round-2 P1: `project_dirs=[]` (empty list) with apply must raise,
@@ -608,9 +780,9 @@ def boom(*args, **kwargs):
                 project_dirs=[synced_world["repo_path"]],
                 dry_run=False,
             )
-        assert any(
-            "Closet purge skipped" in record.getMessage() for record in caplog.records
-        ), f"expected closet-skip warning, got: {[r.getMessage() for r in caplog.records]}"
+        assert any("Closet purge skipped" in record.getMessage() for record in caplog.records), (
+            f"expected closet-skip warning, got: {[r.getMessage() for r in caplog.records]}"
+        )
 
     def test_metadata_cache_cleared_on_exception(self, monkeypatch, config, synced_world, kg):
         """F9 regression: tool_sync's try/finally must clear `_metadata_cache`
@@ -651,9 +823,9 @@ def explode(*args, **kwargs):
         assert result.get("success") is False
         assert "simulated" in result.get("error", "")
 
-        assert (
-            mcp_server._metadata_cache is None
-        ), "F9: cache must be cleared even when sync_palace raises"
+        assert mcp_server._metadata_cache is None, (
+            "F9: cache must be cleared even when sync_palace raises"
+        )
 
     def test_sync_report_keys_stable(self, synced_world):
         """Regression: SyncReport schema must not silently drop a field."""
@@ -946,9 +1118,9 @@ def test_symlinked_project_root_resolves(self, tmp_dir, palace_path):
             wing="demo",
             dry_run=True,
         )
-        assert (
-            report["gitignored"] == 1
-        ), f"symmetric resolve broken: drawer mis-bucketed; report={report}"
+        assert report["gitignored"] == 1, (
+            f"symmetric resolve broken: drawer mis-bucketed; report={report}"
+        )
         assert report["out_of_scope"] == 0
 
     def test_classification_cache_avoids_redundant_disk_hits(
@@ -1005,9 +1177,9 @@ def counting_classify(*args, **kwargs):
         )
         assert report["scanned"] == 5
         assert report["gitignored"] == 5
-        assert (
-            call_count["n"] == 1
-        ), f"cache miss: expected 1 _classify_drawer call (4 cache hits), got {call_count['n']}"
+        assert call_count["n"] == 1, (
+            f"cache miss: expected 1 _classify_drawer call (4 cache hits), got {call_count['n']}"
+        )
 
     def test_closet_batch_purge_single_call(self, synced_world, monkeypatch):
         """Batched $in closet purge: one delete() call across all removable
@@ -1074,16 +1246,16 @@ def wrapped_get_closets(p, create=False):
             str(repo_path / "deleted.py"),
         }
         expected = len(seeded_sources & set(report["by_source"].keys()))
-        assert (
-            report["removed_closets"] == expected
-        ), f"removed_closets ({report['removed_closets']}) != |seeded ∩ removable| ({expected})"
+        assert report["removed_closets"] == expected, (
+            f"removed_closets ({report['removed_closets']}) != |seeded ∩ removable| ({expected})"
+        )
         assert "wrapper" in captured, "get_closets_collection patch not invoked"
-        assert (
-            captured["wrapper"].delete_calls == 1
-        ), f"expected one batch delete call, got {captured['wrapper'].delete_calls}"
-        assert (
-            captured["wrapper"].get_calls == 1
-        ), f"expected one batch get call, got {captured['wrapper'].get_calls}"
+        assert captured["wrapper"].delete_calls == 1, (
+            f"expected one batch delete call, got {captured['wrapper'].delete_calls}"
+        )
+        assert captured["wrapper"].get_calls == 1, (
+            f"expected one batch get call, got {captured['wrapper'].get_calls}"
+        )
 
     def test_registry_check_runs_before_cache_lookup(self, tmp_dir, palace_path):
         """A non-registry drawer with the same source_file must NOT poison
@@ -1148,9 +1320,9 @@ def test_registry_check_runs_before_cache_lookup(self, tmp_dir, palace_path):
         finally:
             del client
         assert "a_regular" not in survivors
-        assert (
-            "_reg_zzz_sentinel" in survivors
-        ), "registry sentinel was incorrectly pruned via cached non-registry verdict"
+        assert "_reg_zzz_sentinel" in survivors, (
+            "registry sentinel was incorrectly pruned via cached non-registry verdict"
+        )
 
     def test_normalize_project_dirs_sort_stable_on_equal_length(self):
         """`_normalize_project_dirs` must sort by `(-len, str)` so equal-length
@@ -1397,3 +1569,36 @@ def test_apply_without_scope_exits_2(self, monkeypatch, synced_world, capsys):
         with pytest.raises(SystemExit) as exc_info:
             cli.main()
         assert exc_info.value.code == 2
+
+    def test_named_palace_alias_resolves(self, monkeypatch, synced_world, capsys):
+        """Bug: ``--palace <alias>`` worked for status/mine/search but not
+        sync — cmd_sync was reading args.palace as a raw filesystem path.
+        Must route through the same named-palace resolver as every other
+        subcommand.
+        """
+        import json as _json
+        from mempalace import cli
+
+        cfg_path = os.path.join(os.environ["HOME"], ".mempalace", "config.json")
+        with open(cfg_path) as f:
+            cfg = _json.load(f)
+        original = _json.dumps(cfg)
+        cfg.setdefault("palaces", {})["sync_test_alias"] = synced_world["palace_path"]
+        with open(cfg_path, "w") as f:
+            _json.dump(cfg, f)
+        try:
+            argv = [
+                "mempalace",
+                "--palace",
+                "sync_test_alias",
+                "sync",
+                synced_world["repo_path"],
+            ]
+            monkeypatch.setattr("sys.argv", argv)
+            cli.main()
+            captured = capsys.readouterr().out
+            assert "No palace found" not in captured, captured
+            assert "Scanned:" in captured, captured
+        finally:
+            with open(cfg_path, "w") as f:
+                f.write(original)