From eb504ca12c7b0f453cf9374308d5baf02828a933 Mon Sep 17 00:00:00 2001
From: Rory Ford <me@roryford.com>
Date: Mon, 15 Jun 2026 20:15:34 +1000
Subject: [PATCH 1/3] feat(runtime): batteries-included compression policies
 (truncating / extractive / anchored)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MK shipped the CompressionPolicy / PreTurnCompressionPolicy seams but no
concrete strategy — every consumer had to hand-roll compress(). Add three
default strategies plus a single policy wrapper that conforms to both seams.

- TruncatingCompressionStrategy: zero-inference sliding window (baseline)
- ExtractiveCompressionStrategy: zero-inference scored selection (recency /
  length / keyword-density) with verbatim tail + optional headBudgetFraction
  knob (middle-out / lost-in-the-middle preservation)
- AnchoredCompressionStrategy: summarize old via generate, prepend a
  .memory("summary") record + keep a verbatim recency tail; chunk-and-fold for
  over-window input; falls back to extractive on failure/empty/missing-generate
- DefaultCompressionPolicy: Sendable struct, conforms to BOTH CompressionPolicy
  and PreTurnCompressionPolicy; static factories .truncating/.extractive/.anchored

Ported and rewritten from Fireside's StoryCompression suite against [ChatMessage].

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../AnchoredCompressionStrategy.swift         | 278 ++++++++++++++++++
 .../Compression/CompressionStrategy.swift     |  84 ++++++
 .../DefaultCompressionPolicy.swift            | 136 +++++++++
 .../ExtractiveCompressionStrategy.swift       | 128 ++++++++
 .../TruncatingCompressionStrategy.swift       |  48 +++
 .../DefaultCompressionPolicyTests.swift       | 200 +++++++++++++
 6 files changed, 874 insertions(+)
 create mode 100644 Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift
 create mode 100644 Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift
 create mode 100644 Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift
 create mode 100644 Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift
 create mode 100644 Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift
 create mode 100644 Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift

diff --git a/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift
new file mode 100644
index 00000000..9aef9270
--- /dev/null
+++ b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift
@@ -0,0 +1,278 @@
+import Foundation
+import ManifoldInference
+
+/// Summarises old messages via an inference call, then prepends the summary
+/// (as a `.memory("summary")` record) to a verbatim recency tail. Falls back
+/// to ``ExtractiveCompressionStrategy`` when inference fails or yields empty
+/// text (including a no-op summariser); an oversized summary is truncated.
+///
+/// Adapted from Fireside's `AnchoredStoryCompressor`, preserving its
+/// hard-won behaviors: input-window decoupling (`summarizerInputWindow`),
+/// chunk-and-fold for over-window input, a minimum-summary floor, and a
+/// cancellation early-return. The summary prompt is passed to `generate` as a
+/// single-message mini-conversation.
+struct AnchoredCompressionStrategy: CompressionStrategy {
+    let name = "anchored"
+
+    let tailBudgetFraction: Double
+    /// The summariser model's REAL usable window (tokens), used to size how
+    /// much old text it may READ. Decoupled from `contextSize`, which is the
+    /// small overflow *trigger* and the *output-brief* budget. `nil` sizes
+    /// input against `contextSize` (legacy behavior).
+    let summarizerInputWindow: Int?
+    /// Minimum tokens reserved for the output brief even when `contextSize` is
+    /// tiny — a short brief beats no brief.
+    let minSummaryBudget: Int
+    /// Tokens reserved for the summariser's own response when sizing input.
+    let summarizerResponseBuffer: Int
+    let summaryTemplate: String
+
+    private let fallback = ExtractiveCompressionStrategy()
+
+    /// Placeholder `{old_text}` is replaced with the concatenated old messages.
+    static let defaultSummaryTemplate = """
+        Summarize the conversation so far. Be concise. Use only what is in the text.
+
+        TOPIC: [main subject of the conversation, brief]
+        KEY POINTS: [up to 3 important points, semicolon-separated]
+        OPEN QUESTIONS: [unresolved items or pending decisions, if any]
+        LAST DISCUSSED: [most recent topic or conclusion, one sentence]
+
+        Conversation:
+        {old_text}
+        """
+
+    init(
+        tailBudgetFraction: Double = 0.50,
+        summarizerInputWindow: Int? = nil,
+        minSummaryBudget: Int = 256,
+        summarizerResponseBuffer: Int = 512,
+        summaryTemplate: String? = nil
+    ) {
+        self.tailBudgetFraction = tailBudgetFraction
+        self.summarizerInputWindow = summarizerInputWindow
+        self.minSummaryBudget = minSummaryBudget
+        self.summarizerResponseBuffer = summarizerResponseBuffer
+        self.summaryTemplate = summaryTemplate ?? Self.defaultSummaryTemplate
+    }
+
+    /// Summarises everything outside the verbatim tail into one summary record;
+    /// returns `history` untouched when it already fits the history budget.
+    func compress(
+        history: [ChatMessage],
+        contextSize: Int,
+        tokenizer: (any TokenizerProvider)?,
+        generate: @Sendable ([ChatMessage]) async throws -> String
+    ) async throws -> [ChatMessage] {
+        guard !history.isEmpty else { return [] }
+
+        let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer)
+        let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) }
+        let originalTokens = tokens.reduce(0, +)
+        if originalTokens <= budget {
+            return history
+        }
+
+        let sessionID = history.first?.sessionID ?? UUID()
+        let count = history.count
+
+        // --- Verbatim tail: load-bearing records + newest within tailBudget ---
+        let tailBudget = Int(Double(budget) * tailBudgetFraction)
+        var tailIndices = Set<Int>()
+        var tailTokens = 0
+        for i in 0..<count where isLoadBearing(history[i]) {
+            tailIndices.insert(i)
+            tailTokens += tokens[i]
+        }
+        // Invariant: never drop the newest message. Reserve it BEFORE filling so an
+        // oversized newest message is not stacked on top of an already-full tail.
+        if !tailIndices.contains(count - 1) {
+            tailIndices.insert(count - 1)
+            tailTokens += tokens[count - 1]
+        }
+        // Then add older messages, newest-first, while they still fit the tail budget.
+        for i in stride(from: count - 2, through: 0, by: -1) {
+            if tailIndices.contains(i) { continue }
+            if tailTokens + tokens[i] <= tailBudget {
+                tailIndices.insert(i)
+                tailTokens += tokens[i]
+            }
+        }
+
+        let tailMessages = tailIndices.sorted().map { history[$0] }
+        let oldMessages = (0..<count).filter { !tailIndices.contains($0) }.map { history[$0] }
+
+        if oldMessages.isEmpty {
+            return tailMessages
+        }
+
+        // --- Build summary prompt, sizing input against the REAL window ---
+        let oldText = oldMessages.map(\.content).joined(separator: "\n\n")
+        let oldTextTokens = ContextWindowManager.estimateTokenCount(oldText, tokenizer: tokenizer)
+        let inputWindow = max(summarizerInputWindow ?? contextSize, contextSize)
+        let usableInputBudget = max(0, inputWindow - summarizerResponseBuffer)
+
+        // Chunk-and-fold only when a REAL window was set AND the input genuinely
+        // exceeds it; otherwise a single summarisation call (the summary
+        // truncation during assembly below handles output overflow).
+        let canChunkAndFold = summarizerInputWindow != nil
+        let prompt: String
+        if canChunkAndFold && oldTextTokens > usableInputBudget && usableInputBudget > 0 {
+            Log.inference.warning(
+                "[AnchoredCompression] input \(oldTextTokens) tok exceeds usable window \(usableInputBudget); chunk-and-folding \(oldMessages.count) old messages"
+            )
+            prompt = await foldedSummaryPrompt(
+                oldMessages: oldMessages, chunkBudget: usableInputBudget,
+                sessionID: sessionID, generate: generate, tokenizer: tokenizer
+            )
+        } else {
+            prompt = summaryTemplate.replacingOccurrences(of: "{old_text}", with: oldText)
+        }
+
+        // --- Summarise (cancellation returns a minimal tail-only result) ---
+        let summaryText: String
+        do {
+            try Task.checkCancellation()
+            summaryText = try await generate([summaryMessage(prompt, sessionID: sessionID)])
+        } catch is CancellationError {
+            return tailMessages
+        } catch {
+            Log.inference.debug("[AnchoredCompression] summarisation failed: \(error); falling back to extractive")
+            return try await fallback.compress(
+                history: history, contextSize: contextSize,
+                tokenizer: tokenizer, generate: generate
+            )
+        }
+
+        guard !summaryText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
+            return try await fallback.compress(
+                history: history, contextSize: contextSize,
+                tokenizer: tokenizer, generate: generate
+            )
+        }
+
+        // --- Assemble: summary record + verbatim tail ---
+        let parsed = parseSummaryResponse(summaryText)
+        let summaryTokens = ContextWindowManager.estimateTokenCount(parsed, tokenizer: tokenizer)
+
+        var finalSummary = parsed
+        if summaryTokens + tailTokens > budget {
+            let rawSummaryBudget = budget - tailTokens
+            let summaryBudget = rawSummaryBudget > 0 ? rawSummaryBudget : min(minSummaryBudget, budget)
+            if rawSummaryBudget <= 0 {
+                Log.inference.warning(
+                    "[AnchoredCompression] tail consumes the whole budget (tail=\(tailTokens) >= budget=\(budget)); emitting floored \(summaryBudget)-tok brief"
+                )
+            }
+            let truncated = truncateToFit(parsed, budget: summaryBudget, tokenizer: tokenizer)
+            finalSummary = truncated.isEmpty ? String(parsed.prefix(200)) : truncated
+        }
+
+        var output: [ChatMessage] = [summaryRecord(finalSummary, sessionID: sessionID)]
+        output.append(contentsOf: tailMessages)
+        return output
+    }
+
+    // MARK: - Helpers
+
+    private func summaryMessage(_ prompt: String, sessionID: UUID) -> ChatMessage {
+        ChatMessage(role: .user, content: prompt, sessionID: sessionID)
+    }
+
+    private func summaryRecord(_ text: String, sessionID: UUID) -> ChatMessage {
+        ChatMessage(role: .system, content: text, sessionID: sessionID, kind: .memory("summary"))
+    }
+
+    /// Extracts `FIELD: value` lines and reassembles them; falls back to the
+    /// trimmed raw response (first 400 characters) when fewer than two match.
+    private func parseSummaryResponse(_ response: String) -> String {
+        // `[ \t]*`, not `\s*`: `\s` also matches newlines, so an empty field such
+        // as "OPEN QUESTIONS:" would swallow the NEXT line as its value.
+        let pattern = "^([A-Z][A-Z _]*[A-Z]):[ \\t]*(.+)$"
+        let regex: NSRegularExpression
+        do {
+            regex = try NSRegularExpression(pattern: pattern, options: [.anchorsMatchLines, .caseInsensitive])
+        } catch {
+            // Constant pattern: a failure is a toolchain regression, so log it.
+            Log.inference.error("[AnchoredCompression] summary regex failed to compile: \(error)")
+            let trimmed = response.trimmingCharacters(in: .whitespacesAndNewlines)
+            return trimmed.isEmpty ? "[Summary unavailable]" : String(trimmed.prefix(400))
+        }
+        let ns = response as NSString
+        var fields: [String] = []
+        for match in regex.matches(in: response, range: NSRange(location: 0, length: ns.length)) where match.numberOfRanges >= 3 {
+            let name = ns.substring(with: match.range(at: 1))
+            let value = ns.substring(with: match.range(at: 2)).trimmingCharacters(in: .whitespaces)
+            if !value.isEmpty { fields.append("\(name): \(value)") }
+        }
+        if fields.count >= 2 { return fields.joined(separator: "\n") }
+        let trimmed = response.trimmingCharacters(in: .whitespacesAndNewlines)
+        return trimmed.isEmpty ? "[Summary unavailable]" : String(trimmed.prefix(400))
+    }
+
+    /// Summarises over-window input in chunks, then folds the chunk summaries
+    /// into one prompt so the WHOLE old text is covered rather than truncated.
+    private func foldedSummaryPrompt(
+        oldMessages: [ChatMessage],
+        chunkBudget: Int,
+        sessionID: UUID,
+        generate: @Sendable ([ChatMessage]) async throws -> String,
+        tokenizer: (any TokenizerProvider)?
+    ) async -> String {
+        let bareTemplate = summaryTemplate.replacingOccurrences(of: "{old_text}", with: "")
+        let templateOverhead = ContextWindowManager.estimateTokenCount(bareTemplate, tokenizer: tokenizer)
+        let perChunkBudget = max(64, chunkBudget - templateOverhead)
+
+        var chunks: [[ChatMessage]] = []
+        var current: [ChatMessage] = []
+        var currentTokens = 0
+        for message in oldMessages {
+            let cost = estimateTokens(message, tokenizer: tokenizer)
+            if !current.isEmpty && currentTokens + cost > perChunkBudget {
+                chunks.append(current)
+                current = []
+                currentTokens = 0
+            }
+            current.append(message)
+            currentTokens += cost
+        }
+        if !current.isEmpty { chunks.append(current) }
+
+        var summaries: [String] = []
+        for chunk in chunks {
+            // Cancelled: stop issuing inference calls; the caller discards this prompt.
+            if Task.isCancelled { break }
+            let chunkText = chunk.map(\.content).joined(separator: "\n\n")
+            let chunkPrompt = summaryTemplate.replacingOccurrences(of: "{old_text}", with: chunkText)
+            do {
+                let summary = try await generate([summaryMessage(chunkPrompt, sessionID: sessionID)])
+                if summary.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
+                    summaries.append(truncateToFit(chunkText, budget: perChunkBudget, tokenizer: tokenizer))
+                } else {
+                    summaries.append(parseSummaryResponse(summary))
+                }
+            } catch {
+                // Preserve content even when a chunk's summarisation fails.
+                Log.inference.debug("[AnchoredCompression] chunk summarisation failed: \(error); keeping raw chunk")
+                summaries.append(truncateToFit(chunkText, budget: perChunkBudget, tokenizer: tokenizer))
+            }
+        }
+        return summaryTemplate.replacingOccurrences(of: "{old_text}", with: summaries.joined(separator: "\n\n"))
+    }
+
+    /// Longest space-separated word prefix of `text` that fits `budget`, found by
+    /// binary search (assumes the estimate never shrinks as words are appended).
+    private func truncateToFit(_ text: String, budget: Int, tokenizer: (any TokenizerProvider)?) -> String {
+        if ContextWindowManager.estimateTokenCount(text, tokenizer: tokenizer) <= budget { return text }
+        let words = text.split(separator: " ", omittingEmptySubsequences: true)
+        // Invariant: the first `fits` words fit the budget (zero words yield "").
+        var fits = 0, limit = max(0, words.count - 1)
+        while fits < limit {
+            let probe = (fits + limit + 1) / 2
+            let candidate = words.prefix(probe).joined(separator: " ")
+            let ok = ContextWindowManager.estimateTokenCount(candidate, tokenizer: tokenizer) <= budget
+            if ok { fits = probe } else { limit = probe - 1 }
+        }
+        return words.prefix(fits).joined(separator: " ")
+    }
+}
diff --git a/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift
new file mode 100644
index 00000000..47cc80c0
--- /dev/null
+++ b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift
@@ -0,0 +1,84 @@
+import Foundation
+import ManifoldInference
+
+/// A pure, stateless algorithm that reduces a chat history to fit a token
+/// budget. Strategies are the reusable core shared by both compression seams
+/// (``CompressionPolicy`` post-turn and ``PreTurnCompressionPolicy``
+/// pre-turn); ``DefaultCompressionPolicy`` wraps a strategy with the trigger
+/// thresholds each protocol needs.
+///
+/// Unlike the ``CompressionPolicy``/``PreTurnCompressionPolicy`` `compress`
+/// methods — which receive only `history` + `sessionID` — a strategy also
+/// takes the `contextSize` and `tokenizer` it sizes against. The policy holds
+/// those as configuration and forwards them, because the protocol `compress`
+/// signatures intentionally do not.
+///
+/// ## System-prompt budgeting
+///
+/// A strategy sees only the `history` array. The session system prompt lives
+/// on `ChatSession.systemPrompt` and is *not* part of `history`, so the budget
+/// is sized against `contextSize` minus a fixed response buffer. Any `.system`
+/// role or `.memory` kind records that *are* in `history` are treated as
+/// load-bearing and preserved verbatim by every strategy.
+///
+/// Strategies are `Sendable` value types: `generate` is a parameter, never
+/// stored, so there is no mutable summariser handle to guard.
+protocol CompressionStrategy: Sendable {
+    /// Stable identifier recorded for diagnostics (e.g. `"truncating"`).
+    var name: String { get }
+
+    /// Reduce `history` so it fits the history budget derived from `contextSize`.
+    ///
+    /// - Parameters:
+    ///   - history: Full message history, oldest-first.
+    ///   - contextSize: Backend context window in tokens.
+    ///   - tokenizer: Optional tokenizer for cost estimation; heuristic when `nil`.
+    ///   - generate: Inference call for strategies that summarise; zero-inference
+    ///     strategies ignore it. Empty text signals "no usable summariser".
+    /// - Returns: The replacement history, oldest-first. Never empty when
+    ///   `history` is non-empty.
+    /// - Throws: Whatever the conforming strategy chooses to propagate.
+    func compress(
+        history: [ChatMessage],
+        contextSize: Int,
+        tokenizer: (any TokenizerProvider)?,
+        generate: @Sendable ([ChatMessage]) async throws -> String
+    ) async throws -> [ChatMessage]
+}
+
+// MARK: - Shared helpers
+
+extension CompressionStrategy {
+    /// Token headroom held back for the model's reply when budgeting history.
+    var responseBuffer: Int { 512 }
+
+    /// Estimated token cost of a single message, measured on its `content`.
+    func estimateTokens(_ message: ChatMessage, tokenizer: (any TokenizerProvider)?) -> Int {
+        ContextWindowManager.estimateTokenCount(message.content, tokenizer: tokenizer)
+    }
+
+    /// Sum of the per-message estimates across `messages`.
+    func estimateTokens(_ messages: [ChatMessage], tokenizer: (any TokenizerProvider)?) -> Int {
+        messages.map { estimateTokens($0, tokenizer: tokenizer) }.reduce(0, +)
+    }
+
+    /// History token budget: `contextSize` less `responseBuffer`, floored at
+    /// zero. The session system prompt is not part of `history`, so it is not
+    /// subtracted here; `tokenizer` is accepted but not consulted.
+    func historyBudget(contextSize: Int, tokenizer: (any TokenizerProvider)?) -> Int {
+        let remaining = contextSize - responseBuffer
+        return remaining > 0 ? remaining : 0
+    }
+
+    /// Whether a record must survive compression regardless of budget: any
+    /// `.system`-role message, or any record whose kind is `.memory` (so a
+    /// prior compression brief is not evicted by a later pass).
+    ///
+    // TODO: per-message pinning is not visible to a `[ChatMessage]`-only
+    // policy — `ChatSession.pinnedMessageIDs` lives on the session. When the
+    // seam grows a pinned-IDs channel, honor it here.
+    func isLoadBearing(_ message: ChatMessage) -> Bool {
+        guard message.role != .system else { return true }
+        if case .memory = message.kind { return true } else { return false }
+    }
+}
diff --git a/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift
new file mode 100644
index 00000000..a295a8ac
--- /dev/null
+++ b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift
@@ -0,0 +1,136 @@
+import Foundation
+import ManifoldInference
+
+/// Batteries-included compression policy. Pairs a ``CompressionStrategy`` with
+/// a utilisation threshold and conforms to **both** compression seams —
+/// ``CompressionPolicy`` (post-turn) and ``PreTurnCompressionPolicy``
+/// (pre-turn) — so one value drives either, or both, without a bespoke
+/// conformance.
+///
+/// Construct via the strategy factories rather than the initializer:
+///
+/// ```swift
+/// // Zero-inference, smart selection. Compress at 75% context utilisation.
+/// runtime.compressionPolicy = .extractive(threshold: 0.75, contextSize: 8_192)
+///
+/// // Inference-backed summary of old turns, kept-recent tail. Pre-turn seam.
+/// runtime.preTurnCompressionPolicy = .anchored(threshold: 0.85, contextSize: 8_192)
+///
+/// // Cheapest baseline: drop oldest, keep system/summary records + newest.
+/// runtime.compressionPolicy = .truncating(threshold: 0.90, contextSize: 4_096)
+/// ```
+///
+/// ## Trigger asymmetry
+///
+/// ``CompressionPolicy/shouldCompress(promptTokens:contextSize:contextUtilization:)``
+/// receives utilisation directly. ``PreTurnCompressionPolicy`` does not — its
+/// `shouldCompressBeforeTurn` sees only `messageCount` and `lastPromptTokens`,
+/// so this policy stores `contextSize` and computes utilisation from
+/// `lastPromptTokens / contextSize`. Given equivalent inputs the two seams
+/// agree.
+public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPolicy {
+    private let strategy: any CompressionStrategy
+
+    /// Compression triggers once context utilisation reaches this ratio.
+    public let threshold: Double
+    /// Backend context window in tokens. Stored because the pre-turn seam does
+    /// not supply it, yet the strategy sizes its budget against it.
+    public let contextSize: Int
+    private let tokenizer: (any TokenizerProvider)?
+
+    init(
+        strategy: any CompressionStrategy,
+        threshold: Double,
+        contextSize: Int,
+        tokenizer: (any TokenizerProvider)?
+    ) {
+        self.tokenizer = tokenizer
+        self.contextSize = contextSize
+        self.threshold = threshold
+        self.strategy = strategy
+    }
+
+    // MARK: - Factories
+
+    /// Zero-inference sliding window. Load-bearing (system / summary) records
+    /// and the newest messages that fit are kept; the oldest are dropped.
+    public static func truncating(
+        threshold: Double = 0.90,
+        contextSize: Int,
+        tokenizer: (any TokenizerProvider)? = nil
+    ) -> DefaultCompressionPolicy {
+        let slidingWindow = TruncatingCompressionStrategy()
+        return Self(strategy: slidingWindow, threshold: threshold, contextSize: contextSize, tokenizer: tokenizer)
+    }
+
+    /// Zero-inference scored selection (recency / length / keyword density).
+    ///
+    /// - Parameter headBudgetFraction: When above `0`, pins the oldest messages
+    ///   as a verbatim head to counter the "lost in the middle" effect; `0`
+    ///   disables head preservation.
+    public static func extractive(
+        threshold: Double = 0.75,
+        headBudgetFraction: Double = 0.0,
+        contextSize: Int,
+        tokenizer: (any TokenizerProvider)? = nil
+    ) -> DefaultCompressionPolicy {
+        let scored = ExtractiveCompressionStrategy(headBudgetFraction: headBudgetFraction)
+        return Self(strategy: scored, threshold: threshold, contextSize: contextSize, tokenizer: tokenizer)
+    }
+
+    /// Inference-backed summary of old turns, prepended to a verbatim recent
+    /// tail. Falls back to extractive selection when no summary is produced.
+    ///
+    /// - Parameter summarizerInputWindow: The summariser's REAL window, used to
+    ///   size how much old text it reads — set it to the backend's true context
+    ///   size when `contextSize` is only a small overflow trigger.
+    public static func anchored(
+        threshold: Double = 0.85,
+        contextSize: Int,
+        summarizerInputWindow: Int? = nil,
+        summaryTemplate: String? = nil,
+        tokenizer: (any TokenizerProvider)? = nil
+    ) -> DefaultCompressionPolicy {
+        let summarising = AnchoredCompressionStrategy(
+            summarizerInputWindow: summarizerInputWindow,
+            summaryTemplate: summaryTemplate
+        )
+        return Self(strategy: summarising, threshold: threshold, contextSize: contextSize, tokenizer: tokenizer)
+    }
+
+    // MARK: - CompressionPolicy (post-turn)
+
+    /// Triggers when the reported utilisation reaches `threshold` and the passed window is positive.
+    public func shouldCompress(promptTokens: Int, contextSize: Int, contextUtilization: Double) -> Bool {
+        guard contextSize > 0 else { return false }
+        return contextUtilization >= threshold
+    }
+
+    /// Runs the wrapped strategy with the stored `contextSize` and tokenizer;
+    /// `sessionID` is not forwarded because strategies do not take one.
+    public func compress(
+        history: [ChatMessage],
+        sessionID: UUID,
+        generate: @Sendable ([ChatMessage]) async throws -> String
+    ) async throws -> [ChatMessage] {
+        try await strategy.compress(history: history, contextSize: contextSize, tokenizer: tokenizer, generate: generate)
+    }
+
+    // MARK: - PreTurnCompressionPolicy (pre-turn)
+
+    /// Never triggers without a previous prompt-token count or a positive window.
+    public func shouldCompressBeforeTurn(messageCount: Int, lastPromptTokens: Int?) -> Bool {
+        guard let promptTokens = lastPromptTokens, contextSize > 0 else { return false }
+        let utilisation = Double(promptTokens) / Double(contextSize)
+        return utilisation >= threshold
+    }
+
+    /// Same compression as the post-turn seam.
+    public func compressBeforeTurn(
+        history: [ChatMessage],
+        sessionID: UUID,
+        generate: @Sendable ([ChatMessage]) async throws -> String
+    ) async throws -> [ChatMessage] {
+        try await compress(history: history, sessionID: sessionID, generate: generate)
+    }
+}
diff --git a/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift
new file mode 100644
index 00000000..8332a5c0
--- /dev/null
+++ b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift
@@ -0,0 +1,128 @@
+import Foundation
+import ManifoldInference
+
+/// Zero-inference scored selection. Reserves a verbatim recency tail (and,
+/// optionally, a verbatim head for establishing context), then greedily fills
+/// the remaining budget with the highest-scoring older messages. Score blends
+/// recency, content length, and capitalized-word density (a cheap proper-noun
+/// proxy).
+///
+/// Adapted from Fireside's `ExtractiveStoryCompressor`. The `headBudgetFraction`
+/// knob is new: pinning the oldest messages counters the "lost in the middle"
+/// effect, where models attend most to the start and end of context.
+struct ExtractiveCompressionStrategy: CompressionStrategy {
+    let name = "extractive"
+
+    /// Fraction of the budget reserved for the verbatim recency tail.
+    let tailBudgetFraction: Double
+    /// Fraction of the budget reserved for a verbatim head (oldest messages).
+    /// `0` disables head preservation (Fireside's original behavior).
+    let headBudgetFraction: Double
+    let recencyWeight: Double
+    let lengthWeight: Double
+    let keywordDensityWeight: Double
+
+    init(
+        tailBudgetFraction: Double = 0.40,
+        headBudgetFraction: Double = 0.0,
+        recencyWeight: Double = 0.5,
+        lengthWeight: Double = 0.3,
+        keywordDensityWeight: Double = 0.2
+    ) {
+        self.tailBudgetFraction = tailBudgetFraction
+        self.headBudgetFraction = headBudgetFraction
+        self.recencyWeight = recencyWeight
+        self.lengthWeight = lengthWeight
+        self.keywordDensityWeight = keywordDensityWeight
+    }
+
+    /// Keeps load-bearing records and the newest message unconditionally, then adds a verbatim
+    /// tail, an optional head, and the best-scoring rest while the total stays within budget.
+    func compress(
+        history: [ChatMessage],
+        contextSize: Int,
+        tokenizer: (any TokenizerProvider)?,
+        generate: @Sendable ([ChatMessage]) async throws -> String
+    ) async throws -> [ChatMessage] {
+        guard !history.isEmpty else { return [] }
+
+        let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer)
+        let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) }
+        // Everything fits, or a single message: never evict all history.
+        if tokens.reduce(0, +) <= budget || history.count == 1 { return history }
+
+        let count = history.count
+        let newest = count - 1
+        var keep = Set<Int>()
+        var used = 0
+
+        // Load-bearing records are always kept.
+        for i in 0..<count where isLoadBearing(history[i]) {
+            keep.insert(i)
+            used += tokens[i]
+        }
+
+        // --- Verbatim tail (newest) ---
+        // The newest message is reserved FIRST: forcing it in after the tail is
+        // full would stack an oversized message on top of a spent budget.
+        let tailBudget = Int(Double(budget) * tailBudgetFraction)
+        var tailUsed = 0
+        if keep.insert(newest).inserted {
+            tailUsed += tokens[newest]
+            used += tokens[newest]
+        }
+        for i in stride(from: newest - 1, through: 0, by: -1) {
+            if tailUsed >= tailBudget { break }
+            if keep.contains(i) { continue }
+            // Respect the overall budget too: load-bearing records may have used most of it.
+            if tailUsed + tokens[i] <= tailBudget && used + tokens[i] <= budget {
+                keep.insert(i)
+                tailUsed += tokens[i]
+                used += tokens[i]
+            }
+        }
+
+        // --- Verbatim head (oldest) — anti "lost in the middle" ---
+        if headBudgetFraction > 0 {
+            let headBudget = Int(Double(budget) * headBudgetFraction)
+            var headUsed = 0
+            for i in 0..<count {
+                if keep.contains(i) { continue }
+                if headUsed + tokens[i] <= headBudget && used + tokens[i] <= budget {
+                    keep.insert(i)
+                    headUsed += tokens[i]
+                    used += tokens[i]
+                }
+                if headUsed >= headBudget { break }
+            }
+        }
+
+        // --- Score and greedily select the remainder ---
+        struct Scored { let index: Int; let score: Double; let tokens: Int }
+        var candidates: [Scored] = []
+        for i in 0..<count where !keep.contains(i) {
+            let recency = count > 1 ? Double(i) / Double(count - 1) : 1.0
+            let length = min(1.0, Double(tokens[i]) / 200.0)
+            let density = keywordDensity(of: history[i].content)
+            let score = recency * recencyWeight + length * lengthWeight + density * keywordDensityWeight
+            candidates.append(Scored(index: i, score: score, tokens: tokens[i]))
+        }
+        // `sort` is not guaranteed stable, so break score ties by index explicitly.
+        candidates.sort { $0.score != $1.score ? $0.score > $1.score : $0.index < $1.index }
+
+        for candidate in candidates where used + candidate.tokens <= budget {
+            keep.insert(candidate.index)
+            used += candidate.tokens
+        }
+
+        return keep.sorted().map { history[$0] }
+    }
+
+    /// Ratio of capitalized words to total words — a rough proper-noun proxy.
+    private func keywordDensity(of content: String) -> Double {
+        let words = content.split(whereSeparator: { $0.isWhitespace })
+        guard !words.isEmpty else { return 0 }
+        let capitalized = words.filter { $0.first?.isUppercase == true }.count
+        return Double(capitalized) / Double(words.count)
+    }
+}
diff --git a/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift
new file mode 100644
index 00000000..467d4cf5
--- /dev/null
+++ b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift
@@ -0,0 +1,48 @@
+import Foundation
+import ManifoldInference
+
+/// Zero-inference sliding window. Keeps every load-bearing record
+/// (`.system` / `.memory`) and the newest message unconditionally, then fills
+/// what is left of the budget newest-first. History already within budget is
+/// returned unchanged; `generate` is never called.
+struct TruncatingCompressionStrategy: CompressionStrategy {
+    let name = "truncating"
+
+    func compress(
+        history: [ChatMessage],
+        contextSize: Int,
+        tokenizer: (any TokenizerProvider)?,
+        generate: @Sendable ([ChatMessage]) async throws -> String
+    ) async throws -> [ChatMessage] {
+        guard !history.isEmpty else { return [] }
+
+        let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer)
+        if estimateTokens(history, tokenizer: tokenizer) <= budget {
+            return history
+        }
+
+        var kept = Set<Int>()
+        var used = 0
+
+        // Load-bearing records are always retained, even if they alone exceed the budget.
+        for (i, message) in history.enumerated() where isLoadBearing(message) {
+            kept.insert(i)
+            used += estimateTokens(message, tokenizer: tokenizer)
+        }
+
+        // Walk newest → oldest; a message that does not fit is skipped, not a stop.
+        for i in stride(from: history.count - 1, through: 0, by: -1) {
+            if kept.contains(i) { continue }
+            let cost = estimateTokens(history[i], tokenizer: tokenizer)
+            if kept.isEmpty || used + cost <= budget {  // nothing kept yet: admit the newest regardless of cost
+                kept.insert(i)
+                used += cost
+            }
+        }
+
+        // Invariant: never drop the newest message (its cost is not re-checked against the budget).
+        kept.insert(history.count - 1)
+
+        return kept.sorted().map { history[$0] }
+    }
+}
diff --git a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
new file mode 100644
index 00000000..31f603cc
--- /dev/null
+++ b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
@@ -0,0 +1,200 @@
+@preconcurrency import XCTest
+import Foundation
+@testable import ManifoldRuntime
+@testable import ManifoldInference
+
+/// Unit coverage for the batteries-included compression strategies and the
+/// ``DefaultCompressionPolicy`` wrapper. Pure value transforms — no SwiftData,
+/// no runtime wiring (that lives in `CompressionPolicyTests`).
+final class DefaultCompressionPolicyTests: XCTestCase {
+
+    // MARK: - Fixtures
+
+    private let sessionID = UUID()
+
+    /// Builds a message whose token cost scales with `words`.
+    private func msg(_ role: MessageRole, words: Int, kind: MessageKind = .chat) -> ChatMessage {
+        let content = Array(repeating: "lorem", count: words).joined(separator: " ")
+        return ChatMessage(role: role, content: content, sessionID: sessionID, kind: kind)
+    }
+
+    /// Long alternating conversation that overflows `contextSize`.
+    private func overflowingHistory(turns: Int = 12, words: Int = 120) -> [ChatMessage] {
+        (0..<turns).map { msg($0.isMultiple(of: 2) ? .user : .assistant, words: words) }
+    }
+
+    // Sized so the ~2.1k-token fixtures overflow the budget (≈1.5 tok/word
+    // under the heuristic tokenizer): contextSize - responseBuffer = 1_536.
+    private let contextSize = 2_048
+    private func budget() -> Int { max(0, contextSize - 512) }
+    private func tokens(_ messages: [ChatMessage]) -> Int {
+        messages.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1.content, tokenizer: nil) }
+    }
+
+    private static func echoGenerate(_: [ChatMessage]) async throws -> String {
+        "TOPIC: testing\nKEY POINTS: a; b; c\nLAST DISCUSSED: the end"
+    }
+
+    // MARK: - Truncating
+
+    func testTruncatingLeavesSmallHistoryUntouched() async throws {
+        let history = [msg(.user, words: 5), msg(.assistant, words: 5)]
+        let out = try await TruncatingCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+        XCTAssertEqual(out.map(\.id), history.map(\.id))
+    }
+
+    func testTruncatingDropsOldestAndKeepsNewest() async throws {
+        let history = overflowingHistory()
+        XCTAssertGreaterThan(tokens(history), budget())  // precondition: actually overflows
+
+        let out = try await TruncatingCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+
+        XCTAssertLessThan(out.count, history.count, "expected eviction")
+        XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive")
+        XCTAssertFalse(out.contains { $0.id == history.first?.id }, "oldest should be dropped")
+        XCTAssertLessThanOrEqual(tokens(out), budget())
+    }
+
+    func testTruncatingPreservesLoadBearingRecords() async throws {
+        var history = [msg(.system, words: 10, kind: .chat)]            // system role
+        history.append(msg(.assistant, words: 10, kind: .memory("summary")))  // memory kind
+        history.append(contentsOf: overflowingHistory())
+
+        let out = try await TruncatingCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+
+        XCTAssertTrue(out.contains { $0.role == .system }, "system prompt must survive")
+        XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false },
+                      "prior summary must survive")
+    }
+
+    // MARK: - Extractive
+
+    func testExtractiveReducesBelowBudgetAndKeepsNewest() async throws {
+        let history = overflowingHistory()
+        let out = try await ExtractiveCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+
+        XCTAssertLessThanOrEqual(tokens(out), budget())
+        XCTAssertEqual(out.last?.id, history.last?.id)
+        XCTAssertEqual(out.map(\.id), out.map(\.id).sorted { a, b in
+            (history.firstIndex { $0.id == a } ?? 0) < (history.firstIndex { $0.id == b } ?? 0)
+        }, "output must stay chronological")
+    }
+
+    func testExtractiveSingleMessageNeverEvicted() async throws {
+        let history = [msg(.user, words: 5_000)]  // alone but over budget
+        let out = try await ExtractiveCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+        XCTAssertEqual(out.count, 1)
+    }
+
+    func testExtractiveHeadBudgetPreservesOldest() async throws {
+        let history = overflowingHistory(turns: 16, words: 120)
+        let oldestID = history.first!.id
+
+        let withoutHead = try await ExtractiveCompressionStrategy(headBudgetFraction: 0.0).compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+        let withHead = try await ExtractiveCompressionStrategy(headBudgetFraction: 0.30).compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+
+        // The head knob guarantees the oldest establishing message survives.
+        XCTAssertTrue(withHead.contains { $0.id == oldestID }, "head budget must retain the oldest message")
+        XCTAssertFalse(withoutHead.contains { $0.id == oldestID }, "without head budget the oldest is evictable")
+    }
+
+    // MARK: - Anchored
+
+    func testAnchoredPrependsMemorySummary() async throws {
+        let history = overflowingHistory()
+        let out = try await AnchoredCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: Self.echoGenerate)
+
+        let first = try XCTUnwrap(out.first)
+        XCTAssertEqual(first.role, .system)
+        guard case .memory(let label) = first.kind else {
+            return XCTFail("first record must be a .memory summary")
+        }
+        XCTAssertEqual(label, "summary")
+        XCTAssertEqual(out.last?.id, history.last?.id, "verbatim tail preserved")
+    }
+
+    func testAnchoredFallsBackToExtractiveWhenGenerateFails() async throws {
+        struct Boom: Error {}
+        let history = overflowingHistory()
+        let out = try await AnchoredCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil,
+            generate: { _ in throw Boom() })
+
+        // Fallback produces a reduced history with NO injected summary record.
+        XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false })
+        XCTAssertLessThanOrEqual(tokens(out), budget())
+        XCTAssertFalse(out.isEmpty)
+    }
+
+    func testAnchoredFallsBackOnEmptySummary() async throws {
+        let history = overflowingHistory()
+        let out = try await AnchoredCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil,
+            generate: { _ in "   " })
+        XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false })
+    }
+
+    func testAnchoredWithoutGenerateFallsBack() async throws {
+        let history = overflowingHistory()
+        let out = try await AnchoredCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+        XCTAssertFalse(out.isEmpty)
+        XCTAssertLessThanOrEqual(tokens(out), budget())
+    }
+
+    // MARK: - Policy thresholds & seam agreement
+
+    func testShouldCompressHonorsThreshold() {
+        let policy = DefaultCompressionPolicy.extractive(threshold: 0.75, contextSize: contextSize)
+        XCTAssertTrue(policy.shouldCompress(promptTokens: 0, contextSize: contextSize, contextUtilization: 0.80))
+        XCTAssertFalse(policy.shouldCompress(promptTokens: 0, contextSize: contextSize, contextUtilization: 0.50))
+        XCTAssertFalse(policy.shouldCompress(promptTokens: 0, contextSize: 0, contextUtilization: 0.99),
+                       "unknown context size never compresses")
+    }
+
+    func testPreTurnAndPostTurnTriggersAgree() {
+        let threshold = 0.80
+        let policy = DefaultCompressionPolicy.truncating(threshold: threshold, contextSize: contextSize)
+
+        // Equivalent inputs: promptTokens = 0.85 * contextSize → both should fire.
+        let promptTokens = Int(0.85 * Double(contextSize))
+        let utilization = Double(promptTokens) / Double(contextSize)
+
+        let postTurn = policy.shouldCompress(
+            promptTokens: promptTokens, contextSize: contextSize, contextUtilization: utilization)
+        let preTurn = policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: promptTokens)
+        XCTAssertEqual(preTurn, postTurn)
+        XCTAssertTrue(preTurn)
+
+        // Below threshold: both decline.
+        let lowTokens = Int(0.50 * Double(contextSize))
+        XCTAssertEqual(
+            policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: lowTokens),
+            policy.shouldCompress(promptTokens: lowTokens, contextSize: contextSize,
+                                  contextUtilization: Double(lowTokens) / Double(contextSize)))
+    }
+
+    func testPreTurnWithoutPriorTokensDoesNotCompress() {
+        let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: contextSize)
+        XCTAssertFalse(policy.shouldCompressBeforeTurn(messageCount: 500, lastPromptTokens: nil))
+    }
+
+    func testPolicyCompressDelegatesToStrategy() async throws {
+        let history = overflowingHistory()
+        let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: contextSize)
+        let out = try await policy.compress(history: history, sessionID: sessionID, generate: Self.echoGenerate)
+        XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false })
+
+        // compressBeforeTurn shares the same path.
+        let preOut = try await policy.compressBeforeTurn(history: history, sessionID: sessionID, generate: Self.echoGenerate)
+        XCTAssertEqual(preOut.first?.kind.rawStorage, out.first?.kind.rawStorage)
+    }
+}

From 477830baa56c444fc4a9e82605eb0275eef703e9 Mon Sep 17 00:00:00 2001
From: Rory Ford <me@roryford.com>
Date: Mon, 15 Jun 2026 20:45:30 +1000
Subject: [PATCH 2/3] fix(runtime): review fixes for compression policies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a truncating-strategy test that guards the never-drop-newest
invariant under load-bearing-overflow — the existing tests passed even
with that line removed (the greedy backward fill already keeps the
newest in the common case), so the invariant was effectively untested.
Sabotage-verified.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../DefaultCompressionPolicyTests.swift       | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
index 31f603cc..cc9aaf3f 100644
--- a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
+++ b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
@@ -57,6 +57,25 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         XCTAssertLessThanOrEqual(tokens(out), budget())
     }
 
+    /// When load-bearing records alone exhaust the budget, the greedy backward
+    /// fill cannot admit the newest chat message — only the explicit
+    /// never-drop-newest invariant keeps it. Guards that invariant directly
+    /// (the over-budget-tail path the other truncating tests don't exercise).
+    func testTruncatingKeepsNewestEvenWhenLoadBearingExceedsBudget() async throws {
+        let history = [
+            msg(.system, words: 4_000, kind: .chat),   // load-bearing, alone over budget
+            msg(.user, words: 5),
+            msg(.assistant, words: 5)                    // newest, tiny
+        ]
+        XCTAssertGreaterThan(tokens([history[0]]), budget(), "precondition: load-bearing alone overflows")
+
+        let out = try await TruncatingCompressionStrategy().compress(
+            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+
+        XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive even when load-bearing fills the budget")
+        XCTAssertTrue(out.contains { $0.role == .system }, "load-bearing record retained")
+    }
+
     func testTruncatingPreservesLoadBearingRecords() async throws {
         var history = [msg(.system, words: 10, kind: .chat)]            // system role
         history.append(msg(.assistant, words: 10, kind: .memory("summary")))  // memory kind

From 30116489488d7b803e07fc394dd2615b2d71f98c Mon Sep 17 00:00:00 2001
From: Rory Ford <me@roryford.com>
Date: Mon, 15 Jun 2026 21:24:22 +1000
Subject: [PATCH 3/3] fix(runtime): address review findings on compression
 policies

Budget realism (configurable reservedTokens replacing the bare 512, injectable
tokenizer on the factories, skip-on-tiny-window guard), extractive verbatim-core
overflow clamp, thinking-model robustness (strip <think> before summary parse,
configurable response reserve), multimodal/tool per-part token accounting in
ContextWindowManager, plus the QA test set (asymmetry boundary, chunk-and-fold,
summary-floor, cancellation, tightened weak assertions) and doc fixes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../Services/ContextWindowManager.swift       |  59 ++-
 .../Protocols/CompressionPolicy.swift         |  11 +
 .../AnchoredCompressionStrategy.swift         |  52 ++-
 .../Compression/CompressionStrategy.swift     |  59 ++-
 .../DefaultCompressionPolicy.swift            | 125 +++++-
 .../ExtractiveCompressionStrategy.swift       |  46 ++-
 .../TruncatingCompressionStrategy.swift       |   3 +-
 .../ContextWindowManagerTests.swift           |  62 +++
 .../DefaultCompressionPolicyTests.swift       | 363 ++++++++++++++++--
 9 files changed, 691 insertions(+), 89 deletions(-)

diff --git a/Sources/ManifoldInference/Services/ContextWindowManager.swift b/Sources/ManifoldInference/Services/ContextWindowManager.swift
index c70aa298..c04a09e6 100644
--- a/Sources/ManifoldInference/Services/ContextWindowManager.swift
+++ b/Sources/ManifoldInference/Services/ContextWindowManager.swift
@@ -71,6 +71,60 @@ public enum ContextWindowManager {
         return HeuristicTokenizer().tokenCount(text)
     }
 
+    /// Fixed per-part token cost for a non-text modality whose true token
+    /// footprint isn't visible from the `[ChatMessage]` value alone.
+    ///
+    /// `.content` only exposes `.text` parts, so an image/audio part would
+    /// otherwise estimate as **zero** — a multimodal turn could then sail past
+    /// the compression trigger and overflow at generation. These are coarse
+    /// placeholders, not provider-accurate counts (a single image is ~85–1.5k
+    /// tokens depending on resolution / provider), deliberately sized to be
+    /// "clearly non-zero, never catastrophically wrong": they make the budget
+    /// math notice the part exists. A backend that knows its real tiling cost
+    /// should account for that at the wire layer, not here.
+    static let imagePartTokenEstimate = 768  // charged once per `.image` part
+    static let audioPartTokenEstimate = 384  // charged once per `.audio` part; same coarse-placeholder rationale
+    static let generatedMediaPartTokenEstimate = 256  // charged once per `.generatedMedia` part; same rationale
+
+    /// Estimates the token cost of a single `MessagePart`.
+    ///
+    /// - `.text` / `.thinking`: counted via the tokenizer (heuristic when nil).
+    /// - `.image` / `.audio` / `.generatedMedia`: a documented fixed estimate
+    ///   (``imagePartTokenEstimate`` etc.) since the byte payload's true token
+    ///   footprint isn't derivable here.
+    /// - `.toolCall`: the tool name plus its arguments, interpolated into one string.
+    /// - `.toolResult`: `content` and `dialog`, space-joined (nil fields skipped).
+    public static func estimateTokenCount(_ part: MessagePart, tokenizer: TokenizerProvider? = nil) -> Int {
+        switch part {
+        case .text(let t):
+            return estimateTokenCount(t, tokenizer: tokenizer)
+        case .thinking(let t, _):  // the second associated value does not affect the estimate
+            return estimateTokenCount(t, tokenizer: tokenizer)
+        case .image:
+            return imagePartTokenEstimate
+        case .audio:
+            return audioPartTokenEstimate
+        case .generatedMedia:
+            return generatedMediaPartTokenEstimate
+        case .toolCall(let call):
+            return estimateTokenCount("\(call.toolName) \(call.arguments)", tokenizer: tokenizer)
+        case .toolResult(let result):
+            let payload = [result.content, result.dialog].compactMap { $0 }.joined(separator: " ")  // drops any nil field
+            return estimateTokenCount(payload, tokenizer: tokenizer)
+        }
+    }
+
+    /// Estimates the token cost of a whole message by summing the per-part
+    /// estimate over `contentParts` — text, reasoning, multimodal, and tool parts.
+    ///
+    /// Prefer this over `estimateTokenCount(message.content, …)`: `.content`
+    /// discards everything but `.text` parts, so the string overload silently
+    /// under-counts (to zero) image/audio/tool-only messages and lets them
+    /// overflow the window. A message with no parts estimates as `0`.
+    public static func estimateTokenCount(_ message: ChatMessage, tokenizer: TokenizerProvider? = nil) -> Int {
+        message.contentParts.reduce(0) { $0 + estimateTokenCount($1, tokenizer: tokenizer) }
+    }
+
     /// Resolves the effective context size from available sources.
     ///
     /// Priority: session override > model metadata > backend capabilities > default.
@@ -121,7 +175,8 @@ public enum ContextWindowManager {
         var usedTokens = 0
 
         for i in stride(from: messages.count - 1, through: 0, by: -1) {
-            let messageTokens = estimateTokenCount(messages[i].content, tokenizer: tokenizer)
+            // Sum across all parts — `.content` would miss image/audio/tool parts.
+            let messageTokens = estimateTokenCount(messages[i], tokenizer: tokenizer)
             if usedTokens + messageTokens > available && firstKeptIndex < messages.endIndex {
                 break
             }
@@ -141,7 +196,7 @@ public enum ContextWindowManager {
         tokenizer: TokenizerProvider? = nil
     ) -> ContextBudget {
         let systemTokens = estimateTokenCount(systemPrompt ?? "", tokenizer: tokenizer)
-        let messageTokens = messages.reduce(0) { $0 + estimateTokenCount($1.content, tokenizer: tokenizer) }
+        let messageTokens = messages.reduce(0) { $0 + estimateTokenCount($1, tokenizer: tokenizer) }
         let availableForHistory = maxTokens - systemTokens - responseBuffer
 
         return ContextBudget(
diff --git a/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift b/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift
index cdd303c6..314a9212 100644
--- a/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift
+++ b/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift
@@ -11,6 +11,17 @@ import ManifoldInference
 /// Compression failures are logged and do not abort the turn — the existing
 /// history is preserved.
 ///
+/// ## Per-message pins are not yet threaded through this seam
+///
+/// `compress(history:sessionID:generate:)` passes only a `[ChatMessage]` and
+/// the `sessionID` — **not** the set of user-pinned message IDs. The data
+/// already exists (`ChatSession.pinnedMessageIDsRaw` / `pinnedMessageIDs` on
+/// the session record), but honoring pins inside a policy requires a
+/// protocol-signature change to carry the pinned-ID set (a new parameter or a
+/// session handle). Until that lands, ``DefaultCompressionPolicy`` treats only
+/// `.system`-role and `.memory`-kind records as load-bearing; explicit
+/// per-message pins are not preserved across compression.
+///
 /// ## v0.26.0 Migration
 ///
 /// The `shouldCompress` signature gained a `contextUtilization` parameter in v0.26.0.
diff --git a/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift
index 9aef9270..470232ec 100644
--- a/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift
+++ b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift
@@ -11,6 +11,16 @@ import ManifoldInference
 /// chunk-and-fold for over-window input, a minimum-summary floor, and a
 /// cancellation early-return. The summary prompt is passed to `generate` as a
 /// single-message mini-conversation.
+///
+// TODO(#1885, optional P2): collapse/fold a prior `.memory("summary")` record
+// into the new summarisation pass instead of pinning it (as load-bearing) into
+// the tail and prepending a SECOND inline summary each cycle. Across many
+// compression cycles the inline summary blocks stack. Doing this right means
+// detecting the prior summary, excluding it from the verbatim tail, and
+// feeding its text into the summariser input alongside the old messages — a
+// non-trivial change to the tail/old-message partition. Deliberately deferred
+// to keep this fix-round diff bounded; the stack is bounded in practice by the
+// budget enforcement and is correctness-neutral (just less compact).
 struct AnchoredCompressionStrategy: CompressionStrategy {
     let name = "anchored"
 
@@ -23,7 +33,10 @@ struct AnchoredCompressionStrategy: CompressionStrategy {
     /// Minimum tokens reserved for the output brief even when `contextSize` is
     /// tiny — a short brief beats no brief.
     let minSummaryBudget: Int
-    /// Tokens reserved for the summariser's own response when sizing input.
+    /// Tokens reserved for the summariser's own response when sizing the INPUT
+    /// window. This is the SAME reservation knob as the policy's
+    /// `reservedTokens` (the factory threads `reservedTokens` here), so there is
+    /// one source of truth — there is no longer a separate hard-coded buffer.
     let summarizerResponseBuffer: Int
     let summaryTemplate: String
 
@@ -44,9 +57,9 @@ struct AnchoredCompressionStrategy: CompressionStrategy {
 
     init(
         tailBudgetFraction: Double = 0.50,
+        summarizerResponseBuffer: Int = DefaultCompressionPolicy.defaultReservedTokens,
         summarizerInputWindow: Int? = nil,
         minSummaryBudget: Int = 256,
-        summarizerResponseBuffer: Int = 512,
         summaryTemplate: String? = nil
     ) {
         self.tailBudgetFraction = tailBudgetFraction
@@ -59,12 +72,13 @@ struct AnchoredCompressionStrategy: CompressionStrategy {
     func compress(
         history: [ChatMessage],
         contextSize: Int,
+        reservedTokens: Int,
         tokenizer: (any TokenizerProvider)?,
         generate: @Sendable ([ChatMessage]) async throws -> String
     ) async throws -> [ChatMessage] {
         guard !history.isEmpty else { return [] }
 
-        let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer)
+        let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens)
         let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) }
         let originalTokens = tokens.reduce(0, +)
         if originalTokens <= budget {
@@ -140,14 +154,14 @@ struct AnchoredCompressionStrategy: CompressionStrategy {
             Log.inference.debug("[AnchoredCompression] summarisation failed: \(error); falling back to extractive")
             return try await fallback.compress(
                 history: history, contextSize: contextSize,
-                tokenizer: tokenizer, generate: generate
+                reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate
             )
         }
 
         guard !summaryText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
             return try await fallback.compress(
                 history: history, contextSize: contextSize,
-                tokenizer: tokenizer, generate: generate
+                reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate
             )
         }
 
@@ -183,9 +197,33 @@ struct AnchoredCompressionStrategy: CompressionStrategy {
         ChatMessage(role: .system, content: text, sessionID: sessionID, kind: .memory("summary"))
     }
 
+    /// Strips leaked chain-of-thought (`<think>…</think>`, `<thinking>…`)
+    /// before parsing, reusing MK's ``ThinkingTransform`` rather than
+    /// hand-rolling marker logic. A reasoning model can emit its scratchpad
+    /// ahead of the brief; left in, that scratchpad would be parsed as the
+    /// "summary" and injected verbatim into the compressed history. We run the
+    /// two common marker families (Qwen/DeepSeek `<think>`, Mistral/Sky-T1
+    /// `<thinking>`) in sequence and keep only the visible `.token` text.
+    private func stripThinking(_ text: String) -> String {
+        var result = text
+        for markers in [ThinkingMarkers.qwen3, ThinkingMarkers.mistralReasoning] {
+            var transform = ThinkingTransform(markers: markers)  // fresh transform per marker family
+            var events = transform.process([.token(result)])  // whole response goes through as one token event
+            events += transform.finalize()  // presumably flushes text still held by the transform — confirm
+            let visible = events.compactMap { event -> String? in
+                if case .token(let t) = event { return t }  // every non-`.token` event is discarded
+                return nil
+            }.joined()
+            result = visible  // the next marker family runs on the already-stripped text
+        }
+        return result
+    }
+
     /// Extracts `FIELD: value` lines and reassembles them; falls back to a
-    /// trimmed raw response when fewer than two fields are present.
-    private func parseSummaryResponse(_ response: String) -> String {
+    /// trimmed raw response when fewer than two fields are present. Strips
+    /// leaked reasoning first so chain-of-thought can't masquerade as a summary.
+    private func parseSummaryResponse(_ rawResponse: String) -> String {
+        let response = stripThinking(rawResponse)
         let pattern = "^([A-Z][A-Z _]*[A-Z]):\\s*(.+)$"
         let regex: NSRegularExpression
         do {
diff --git a/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift
index 47cc80c0..ab99e8a3 100644
--- a/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift
+++ b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift
@@ -13,13 +13,19 @@ import ManifoldInference
 /// those as configuration and forwards them, because the protocol `compress`
 /// signatures intentionally do not.
 ///
-/// ## System-prompt budgeting
+/// ## System-prompt budgeting (seam constraint)
 ///
 /// A strategy sees only the `history` array. The session system prompt lives
-/// on `ChatSession.systemPrompt` and is *not* part of `history`, so the budget
-/// is sized against `contextSize` minus a fixed response buffer. Any `.system`
-/// role or `.memory` kind records that *are* in `history` are treated as
-/// load-bearing and preserved verbatim by every strategy.
+/// on `ChatSession.systemPrompt` and is *not* part of `history`, and the
+/// `CompressionPolicy` / `PreTurnCompressionPolicy` protocol `compress`
+/// signatures pass **neither** a system prompt nor a tokenizer — so a strategy
+/// cannot subtract a *real* system-prompt token count and cannot read a live
+/// tokenizer. Both are therefore **configuration** on ``DefaultCompressionPolicy``
+/// (set via the factories) and forwarded in: `reservedTokens` (response
+/// headroom + a system-prompt allowance) and an optional `tokenizer`. The
+/// budget is `contextSize − reservedTokens`. Any `.system`-role or `.memory`-kind
+/// records that *are* in `history` are treated as load-bearing and preserved
+/// verbatim by every strategy.
 ///
 /// Strategies are `Sendable` value types: `generate` is a parameter, never
 /// stored, so there is no mutable summariser handle to guard.
@@ -32,7 +38,13 @@ protocol CompressionStrategy: Sendable {
     /// - Parameters:
     ///   - history: Full message history, oldest-first.
     ///   - contextSize: Backend context window in tokens.
-    ///   - tokenizer: Optional tokenizer for cost estimation; heuristic when `nil`.
+    ///   - reservedTokens: Tokens carved out of `contextSize` before history is
+    ///     sized — response headroom **plus** a system-prompt allowance (the
+    ///     real prompt isn't visible here). Single source of truth for the
+    ///     reservation across all strategies.
+    ///   - tokenizer: Optional tokenizer for cost estimation; heuristic
+    ///     (chars/4) when `nil`, in which case the budget is **advisory**, not
+    ///     guaranteed to match the backend's real token count.
     ///   - generate: Inference call for strategies that summarise. Zero-inference
     ///     strategies ignore it. A closure that yields empty text signals "no
     ///     usable summariser" — summarising strategies fall back accordingly.
@@ -41,6 +53,7 @@ protocol CompressionStrategy: Sendable {
     func compress(
         history: [ChatMessage],
         contextSize: Int,
+        reservedTokens: Int,
         tokenizer: (any TokenizerProvider)?,
         generate: @Sendable ([ChatMessage]) async throws -> String
     ) async throws -> [ChatMessage]
@@ -49,12 +62,11 @@ protocol CompressionStrategy: Sendable {
 // MARK: - Shared helpers
 
 extension CompressionStrategy {
-    /// Tokens reserved for the model's own response when sizing history.
-    var responseBuffer: Int { 512 }
-
-    /// Per-message token estimate.
+    /// Per-message token estimate — sums across ALL content parts (text,
+    /// reasoning, image/audio, tool-call/-result), not just `.text`, so a
+    /// multimodal or tool-only message isn't counted as zero (#1885 finding 9).
     func estimateTokens(_ message: ChatMessage, tokenizer: (any TokenizerProvider)?) -> Int {
-        ContextWindowManager.estimateTokenCount(message.content, tokenizer: tokenizer)
+        ContextWindowManager.estimateTokenCount(message, tokenizer: tokenizer)
     }
 
     /// Total tokens across a message set.
@@ -62,20 +74,29 @@ extension CompressionStrategy {
         messages.reduce(0) { $0 + estimateTokens($1, tokenizer: tokenizer) }
     }
 
-    /// Token budget available for history: context window minus the response
-    /// buffer. The session system prompt is not visible here (it is not part
-    /// of `history`), so it is not subtracted.
-    func historyBudget(contextSize: Int, tokenizer: (any TokenizerProvider)?) -> Int {
-        max(0, contextSize - responseBuffer)
+    /// Token budget available for history: context window minus `reservedTokens`
+    /// (response headroom + system-prompt allowance). The session system prompt
+    /// is not visible here, so its real cost can't be subtracted — the
+    /// allowance baked into `reservedTokens` covers it. Returns `0` when the
+    /// reservation meets or exceeds the window; callers must guard that case
+    /// (`reservedTokens >= contextSize`) and skip compression rather than churn
+    /// against a zero budget.
+    func historyBudget(contextSize: Int, reservedTokens: Int) -> Int {
+        max(0, contextSize - reservedTokens)
     }
 
     /// `true` for records that must survive compression regardless of budget:
     /// `.system`-role prompt fragments and `.memory` summarisation artifacts
     /// (a prior compression brief must not be evicted by a later pass).
     ///
-    // TODO: per-message pinning is not visible to a `[ChatMessage]`-only
-    // policy — `ChatSession.pinnedMessageIDs` lives on the session. When the
-    // seam grows a pinned-IDs channel, honor it here.
+    // TODO(#1885): honor per-message pins. The data already exists —
+    // `ChatSession.pinnedMessageIDs` holds the pinned IDs — but the
+    // `CompressionPolicy.compress(history:sessionID:generate:)` /
+    // `PreTurnCompressionPolicy.compressBeforeTurn(...)` signatures pass only a
+    // `[ChatMessage]` and the `sessionID`, not the pinned-ID set. Threading the
+    // pins through is a PROTOCOL-SIGNATURE change (a new parameter or a session
+    // handle), so it is deliberately out of scope for this PR; see the note on
+    // the `CompressionPolicy` protocol doc.
     func isLoadBearing(_ message: ChatMessage) -> Bool {
         if message.role == .system { return true }
         if case .memory = message.kind { return true }
diff --git a/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift
index a295a8ac..1883771f 100644
--- a/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift
+++ b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift
@@ -7,27 +7,56 @@ import ManifoldInference
 /// (pre-turn) — so one value drives either, or both, without a bespoke
 /// conformance.
 ///
-/// Construct via the strategy factories rather than the initializer:
+/// Construct via the strategy factories rather than the initializer, and inject
+/// the policy through ``ConversationRuntimeOptions`` (or the matching
+/// `ConversationRuntime` init parameter) — there is no mutable
+/// `runtime.compressionPolicy` property:
 ///
 /// ```swift
 /// // Zero-inference, smart selection. Compress at 75% context utilisation.
-/// runtime.compressionPolicy = .extractive(threshold: 0.75, contextSize: 8_192)
-///
-/// // Inference-backed summary of old turns, kept-recent tail. Pre-turn seam.
-/// runtime.preTurnCompressionPolicy = .anchored(threshold: 0.85, contextSize: 8_192)
+/// let options = ConversationRuntimeOptions(
+///     compressionPolicy: .extractive(threshold: 0.75, contextSize: 8_192)
+/// )
 ///
 /// // Cheapest baseline: drop oldest, keep system/summary records + newest.
-/// runtime.compressionPolicy = .truncating(threshold: 0.90, contextSize: 4_096)
+/// let truncating = ConversationRuntimeOptions(
+///     compressionPolicy: .truncating(threshold: 0.90, contextSize: 4_096)
+/// )
+///
+/// // Inference-backed summary of old turns, kept-recent tail. Prefer the
+/// // POST-turn seam: anchored runs a full summariser round-trip, and on the
+/// // pre-turn seam that latency lands before the user's message even renders.
+/// let anchored = ConversationRuntimeOptions(
+///     compressionPolicy: .anchored(threshold: 0.85, contextSize: 8_192)
+/// )
+///
+/// // For the PRE-turn seam, prefer a zero-inference strategy so turn setup
+/// // pays no summariser latency:
+/// let preTurn = ConversationRuntimeOptions(
+///     preTurnCompressionPolicy: .extractive(threshold: 0.85, contextSize: 8_192)
+/// )
 /// ```
 ///
+/// ## Budget realism (seam constraint)
+///
+/// The protocol `compress` signatures pass neither a system prompt nor a
+/// tokenizer, so this policy holds both as configuration. `reservedTokens`
+/// carves response headroom **plus** a system-prompt allowance out of
+/// `contextSize`; the history budget is `contextSize − reservedTokens`. Pass a
+/// real `tokenizer:` to the factories for a guaranteed budget — with
+/// `tokenizer: nil` the budget is **advisory** (a chars/4 heuristic that can
+/// diverge from the backend's real token count that drives the trigger).
+///
 /// ## Trigger asymmetry
 ///
 /// ``CompressionPolicy/shouldCompress(promptTokens:contextSize:contextUtilization:)``
 /// receives utilisation directly. ``PreTurnCompressionPolicy`` does not — its
 /// `shouldCompressBeforeTurn` sees only `messageCount` and `lastPromptTokens`,
 /// so this policy stores `contextSize` and computes utilisation from
-/// `lastPromptTokens / contextSize`. Given equivalent inputs the two seams
-/// agree.
+/// `lastPromptTokens / contextSize`. The two can disagree at the boundary by a
+/// rounding margin: a post-turn caller that hands in an already-rounded
+/// utilisation may cross the threshold while the pre-turn recompute from raw
+/// `lastPromptTokens` stays just below it.
 public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPolicy {
     private let strategy: any CompressionStrategy
 
@@ -36,17 +65,49 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol
     /// Backend context window (tokens). Held as configuration because the
     /// pre-turn seam does not pass it and the strategy needs it to size budget.
     public let contextSize: Int
+    /// Tokens reserved out of `contextSize` before history is sized: response
+    /// headroom + a system-prompt allowance (the real system prompt is not
+    /// visible to the strategy). Single source of truth for the reservation.
+    public let reservedTokens: Int
     private let tokenizer: (any TokenizerProvider)?
 
+    /// Default reservation when a caller doesn't override it.
+    ///
+    /// The legacy value was a bare `512`, which only covered a short response
+    /// and left **nothing** for the session system prompt (which is not part of
+    /// `history` and so can't be subtracted directly here). `2048` reserves a
+    /// generous response headroom plus a system-prompt allowance: it matches
+    /// the `maxOutputTokens ?? 2048` default that `GenerationQueue` /
+    /// `PromptAssembler` already reserve at the wire layer, so the trigger and
+    /// the budget agree on roughly the same headroom. Reasoning models that
+    /// emit thousands of thinking tokens should raise this further (see
+    /// ``scaledReservedTokens(forContextSize:base:)``) — a too-small reserve is
+    /// the classic thinking-model overflow.
+    public static let defaultReservedTokens = 2_048
+
+    /// A context-scaled reservation, `max(base, contextSize / 8)`: never below
+    /// `base`, and at least ~12.5% of the window so large contexts leave
+    /// proportional response/thinking headroom. There is no upper cap, so a
+    /// `base` above the window is returned as-is. Use this for reasoning models.
+    public static func scaledReservedTokens(
+        forContextSize contextSize: Int,
+        base: Int = defaultReservedTokens
+    ) -> Int {
+        // The old `min(scaled, max(base, contextSize / 2))` cap never bound; dropped.
+        max(base, contextSize / 8)
+    }
+
     init(
         strategy: any CompressionStrategy,
         threshold: Double,
         contextSize: Int,
+        reservedTokens: Int,
         tokenizer: (any TokenizerProvider)?
     ) {
         self.strategy = strategy
         self.threshold = threshold
         self.contextSize = contextSize
+        self.reservedTokens = reservedTokens
         self.tokenizer = tokenizer
     }
 
@@ -54,14 +115,22 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol
 
     /// Zero-inference sliding window: keep system/summary records + the newest
     /// messages that fit, drop the oldest.
+    ///
+    /// - Parameters:
+    ///   - reservedTokens: Tokens carved from `contextSize` for response +
+    ///     system-prompt headroom (default ``defaultReservedTokens``).
+    ///   - tokenizer: Inject the backend's tokenizer for a guaranteed budget;
+    ///     `nil` (default) makes the budget advisory (chars/4 heuristic).
     public static func truncating(
         threshold: Double = 0.90,
         contextSize: Int,
+        reservedTokens: Int = defaultReservedTokens,
         tokenizer: (any TokenizerProvider)? = nil
     ) -> DefaultCompressionPolicy {
         DefaultCompressionPolicy(
             strategy: TruncatingCompressionStrategy(),
-            threshold: threshold, contextSize: contextSize, tokenizer: tokenizer
+            threshold: threshold, contextSize: contextSize,
+            reservedTokens: reservedTokens, tokenizer: tokenizer
         )
     }
 
@@ -72,33 +141,47 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol
         threshold: Double = 0.75,
         headBudgetFraction: Double = 0.0,
         contextSize: Int,
+        reservedTokens: Int = defaultReservedTokens,
         tokenizer: (any TokenizerProvider)? = nil
     ) -> DefaultCompressionPolicy {
         DefaultCompressionPolicy(
             strategy: ExtractiveCompressionStrategy(headBudgetFraction: headBudgetFraction),
-            threshold: threshold, contextSize: contextSize, tokenizer: tokenizer
+            threshold: threshold, contextSize: contextSize,
+            reservedTokens: reservedTokens, tokenizer: tokenizer
         )
     }
 
     /// Inference-backed summary of old turns prepended to a verbatim recent
     /// tail. Falls back to extractive when no summary can be produced.
     ///
-    /// - Parameter summarizerInputWindow: the summariser's REAL window, used to
-    ///   size how much old text it reads — set this to the backend's true
-    ///   context size when `contextSize` is a small overflow trigger.
+    /// Prefer the **post-turn** seam for anchored: it runs a full summariser
+    /// round-trip, and on the pre-turn seam that latency is paid before the
+    /// user's just-typed message renders.
+    ///
+    /// - Parameters:
+    ///   - summarizerInputWindow: the summariser's REAL window, used to size how
+    ///     much old text it reads — set this to the backend's true context size
+    ///     when `contextSize` is a small overflow trigger.
+    ///   - summaryTemplate: custom prompt. Note the coupling with
+    ///     `parseSummaryResponse`: a custom template should emit
+    ///     `UPPERCASE-FIELD: value` lines (≥2) or the parser degrades to a
+    ///     raw-truncated brief. `{old_text}` is the substitution placeholder.
     public static func anchored(
         threshold: Double = 0.85,
         contextSize: Int,
+        reservedTokens: Int = defaultReservedTokens,
         summarizerInputWindow: Int? = nil,
         summaryTemplate: String? = nil,
         tokenizer: (any TokenizerProvider)? = nil
     ) -> DefaultCompressionPolicy {
         DefaultCompressionPolicy(
             strategy: AnchoredCompressionStrategy(
+                summarizerResponseBuffer: reservedTokens,
                 summarizerInputWindow: summarizerInputWindow,
                 summaryTemplate: summaryTemplate
             ),
-            threshold: threshold, contextSize: contextSize, tokenizer: tokenizer
+            threshold: threshold, contextSize: contextSize,
+            reservedTokens: reservedTokens, tokenizer: tokenizer
         )
     }
 
@@ -113,9 +196,19 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol
         sessionID: UUID,
         generate: @Sendable ([ChatMessage]) async throws -> String
     ) async throws -> [ChatMessage] {
-        try await strategy.compress(
+        // Guard the degenerate window: if the reservation meets or exceeds the
+        // context the history budget is zero, and every pass would report
+        // "over budget" forever (the 512-token simulator cap is the canonical
+        // trap). Skip rather than churn.
+        guard contextSize > reservedTokens else {
+            Log.inference.warning(
+                "[Compression] contextSize \(contextSize) <= reservedTokens \(reservedTokens); skipping compression (no usable history budget)"
+            )
+            return history
+        }
+        return try await strategy.compress(
             history: history, contextSize: contextSize,
-            tokenizer: tokenizer, generate: generate
+            reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate
         )
     }
 
diff --git a/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift
index 8332a5c0..327fdc98 100644
--- a/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift
+++ b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift
@@ -20,8 +20,20 @@ struct ExtractiveCompressionStrategy: CompressionStrategy {
     let headBudgetFraction: Double
     let recencyWeight: Double
     let lengthWeight: Double
+    /// Weight for capitalized-word density. NOTE: this signal assumes
+    /// English-like prose where proper nouns and sentence starts are
+    /// capitalized. It degrades on all-lowercase text, source code, and
+    /// non-cased scripts (CJK), where density trends to ~0 and the term simply
+    /// drops out. It is the smallest weight (0.2) precisely so it only *nudges*
+    /// selection rather than dominating it.
     let keywordDensityWeight: Double
 
+    /// Combined ceiling for the verbatim head + tail fractions. Past this the
+    /// pinned-verbatim core could equal or exceed the whole budget, leaving no
+    /// room for scored selection and risking an over-budget result before
+    /// scoring even runs.
+    static let maxVerbatimCoreFraction = 0.8
+
     init(
         tailBudgetFraction: Double = 0.40,
         headBudgetFraction: Double = 0.0,
@@ -39,12 +51,13 @@ struct ExtractiveCompressionStrategy: CompressionStrategy {
     func compress(
         history: [ChatMessage],
         contextSize: Int,
+        reservedTokens: Int,
         tokenizer: (any TokenizerProvider)?,
         generate: @Sendable ([ChatMessage]) async throws -> String
     ) async throws -> [ChatMessage] {
         guard !history.isEmpty else { return [] }
 
-        let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer)
+        let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens)
         let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) }
         let originalTokens = tokens.reduce(0, +)
 
@@ -63,8 +76,16 @@ struct ExtractiveCompressionStrategy: CompressionStrategy {
             used += tokens[i]
         }
 
+        // Clamp the verbatim core (head + tail) so the two reserved bands can't
+        // jointly claim the whole budget and leave nothing for scored selection
+        // (or overflow it before scoring runs). Tail keeps priority; head takes
+        // whatever remains under the ceiling.
+        let coreFraction = min(Self.maxVerbatimCoreFraction, tailBudgetFraction + headBudgetFraction)
+        let effectiveTailFraction = min(tailBudgetFraction, coreFraction)
+        let effectiveHeadFraction = max(0.0, coreFraction - effectiveTailFraction)
+
         // --- Verbatim tail (newest) ---
-        let tailBudget = Int(Double(budget) * tailBudgetFraction)
+        let tailBudget = Int(Double(budget) * effectiveTailFraction)
         var tailUsed = 0
         for i in stride(from: count - 1, through: 0, by: -1) {
             if keep.contains(i) { continue }
@@ -82,8 +103,8 @@ struct ExtractiveCompressionStrategy: CompressionStrategy {
         }
 
         // --- Verbatim head (oldest) — anti "lost in the middle" ---
-        if headBudgetFraction > 0 {
-            let headBudget = Int(Double(budget) * headBudgetFraction)
+        if effectiveHeadFraction > 0 {
+            let headBudget = Int(Double(budget) * effectiveHeadFraction)
             var headUsed = 0
             for i in 0..<count {
                 if keep.contains(i) { continue }
@@ -115,6 +136,23 @@ struct ExtractiveCompressionStrategy: CompressionStrategy {
             }
         }
 
+        // Final budget enforcement: the verbatim tail/head admission can push
+        // the union over budget when those bands admit large messages (each
+        // band only checks its own sub-budget). Evict kept non-load-bearing
+        // messages — oldest first, but never the newest — until the union fits.
+        // Load-bearing records are never evicted (they survive regardless of
+        // budget by contract).
+        if used > budget {
+            let newest = count - 1
+            for i in 0..<count where keep.contains(i) {
+                if used <= budget { break }
+                if i == newest { continue }
+                if isLoadBearing(history[i]) { continue }
+                keep.remove(i)
+                used -= tokens[i]
+            }
+        }
+
         return keep.sorted().map { history[$0] }
     }
 
diff --git a/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift
index 467d4cf5..4af0cc0d 100644
--- a/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift
+++ b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift
@@ -11,12 +11,13 @@ struct TruncatingCompressionStrategy: CompressionStrategy {
     func compress(
         history: [ChatMessage],
         contextSize: Int,
+        reservedTokens: Int,
         tokenizer: (any TokenizerProvider)?,
         generate: @Sendable ([ChatMessage]) async throws -> String
     ) async throws -> [ChatMessage] {
         guard !history.isEmpty else { return [] }
 
-        let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer)
+        let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens)
         if estimateTokens(history, tokenizer: tokenizer) <= budget {
             return history
         }
diff --git a/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift b/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift
index f23321ca..216ecac1 100644
--- a/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift
+++ b/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift
@@ -32,6 +32,68 @@ final class ContextWindowManagerTests: XCTestCase {
         XCTAssertEqual(ContextWindowManager.estimateTokenCount("a"), 1)
     }
 
+    // MARK: - Per-part / per-message estimation (#1885 finding 9)
+
+    func test_estimateTokenCount_textPart_matchesStringOverload() {
+        let text = String(repeating: "a", count: 100)  // 25 tokens
+        XCTAssertEqual(ContextWindowManager.estimateTokenCount(MessagePart.text(text)),
+                       ContextWindowManager.estimateTokenCount(text))
+    }
+
+    func test_estimateTokenCount_imagePart_isNonZeroFixed() {
+        let part = MessagePart.image(data: Data(repeating: 0, count: 8), mimeType: "image/png")
+        XCTAssertEqual(ContextWindowManager.estimateTokenCount(part),
+                       ContextWindowManager.imagePartTokenEstimate)
+        XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0,
+                             "image must not estimate as zero")
+    }
+
+    func test_estimateTokenCount_audioPart_isNonZeroFixed() {
+        let part = MessagePart.audio(url: URL(fileURLWithPath: "/tmp/a.m4a"), duration: 3, waveform: nil)
+        XCTAssertEqual(ContextWindowManager.estimateTokenCount(part),
+                       ContextWindowManager.audioPartTokenEstimate)
+    }
+
+    func test_estimateTokenCount_toolCallPart_countsNameAndArgs() {
+        let part = MessagePart.toolCall(ToolCall(
+            id: "c1", toolName: "get_weather",
+            arguments: String(repeating: "x", count: 80)))
+        XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0,
+                             "tool call args must be counted")
+    }
+
+    func test_estimateTokenCount_toolResultPart_countsPayload() {
+        let part = MessagePart.toolResult(ToolResult(
+            callId: "c1", content: String(repeating: "y", count: 120)))
+        XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0)
+    }
+
+    /// The crux: a message with ONLY non-text parts used to estimate as zero
+    /// via `.content`. The per-message estimate must be non-zero.
+    func test_estimateTokenCount_imageOnlyMessage_isNonZero() {
+        let msg = ChatMessage(
+            role: .user,
+            contentParts: [.image(data: Data(repeating: 0, count: 8), mimeType: "image/png")],
+            sessionID: UUID())
+        XCTAssertEqual(msg.content, "", "precondition: .content is empty for image-only message")
+        XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(msg), 0,
+                             "image-only message must not estimate as zero")
+    }
+
+    func test_estimateTokenCount_message_sumsAcrossParts() {
+        let text = String(repeating: "a", count: 100)  // 25 tokens
+        let msg = ChatMessage(
+            role: .user,
+            contentParts: [
+                .text(text),
+                .image(data: Data(repeating: 0, count: 8), mimeType: "image/png"),
+            ],
+            sessionID: UUID())
+        let expected = ContextWindowManager.estimateTokenCount(text)
+            + ContextWindowManager.imagePartTokenEstimate
+        XCTAssertEqual(ContextWindowManager.estimateTokenCount(msg), expected)
+    }
+
     // MARK: - Context Size Resolution
 
     func test_resolveContextSize_sessionOverrideTakesPriority() {
diff --git a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
index cc9aaf3f..3ffb32d5 100644
--- a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
+++ b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift
@@ -23,12 +23,23 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         (0..<turns).map { msg($0.isMultiple(of: 2) ? .user : .assistant, words: words) }
     }
 
-    // Sized so the ~2.1k-token fixtures overflow the budget (≈1.5 tok/word
-    // under the heuristic tokenizer): contextSize - responseBuffer = 1_536.
+    // The strategy tests inject `reservedTokens` directly, so budget is
+    // contextSize - reservedTokens. Keep the historical 512 reserve in the
+    // unit tests so the ~2.1k-token fixtures still overflow budget = 1_536.
     private let contextSize = 2_048
-    private func budget() -> Int { max(0, contextSize - 512) }
+    private let reservedTokens = 512
+    private func budget() -> Int { max(0, contextSize - reservedTokens) }
     private func tokens(_ messages: [ChatMessage]) -> Int {
-        messages.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1.content, tokenizer: nil) }
+        messages.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1, tokenizer: nil) }
+    }
+
+    /// A real (non-nil) tokenizer to exercise the tokenizer-injected path.
+    /// Counts whitespace-separated words — deterministic and != the chars/4
+    /// heuristic, so a test that passes it really takes the tokenizer branch.
+    private struct WordTokenizer: TokenizerProvider {
+        func tokenCount(_ text: String) -> Int {
+            max(1, text.split(whereSeparator: { $0.isWhitespace }).count)
+        }
     }
 
     private static func echoGenerate(_: [ChatMessage]) async throws -> String {
@@ -40,7 +51,8 @@ final class DefaultCompressionPolicyTests: XCTestCase {
     func testTruncatingLeavesSmallHistoryUntouched() async throws {
         let history = [msg(.user, words: 5), msg(.assistant, words: 5)]
         let out = try await TruncatingCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
         XCTAssertEqual(out.map(\.id), history.map(\.id))
     }
 
@@ -49,7 +61,8 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         XCTAssertGreaterThan(tokens(history), budget())  // precondition: actually overflows
 
         let out = try await TruncatingCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
 
         XCTAssertLessThan(out.count, history.count, "expected eviction")
         XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive")
@@ -70,7 +83,8 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         XCTAssertGreaterThan(tokens([history[0]]), budget(), "precondition: load-bearing alone overflows")
 
         let out = try await TruncatingCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
 
         XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive even when load-bearing fills the budget")
         XCTAssertTrue(out.contains { $0.role == .system }, "load-bearing record retained")
@@ -82,31 +96,51 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         history.append(contentsOf: overflowingHistory())
 
         let out = try await TruncatingCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
 
         XCTAssertTrue(out.contains { $0.role == .system }, "system prompt must survive")
         XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false },
                       "prior summary must survive")
     }
 
+    /// Real-tokenizer path: a deterministic word tokenizer (not chars/4) must
+    /// still reduce below the budget it computes.
+    func testTruncatingWithRealTokenizerReducesBelowBudget() async throws {
+        let tok = WordTokenizer()
+        let history = overflowingHistory(turns: 20, words: 200)
+        let out = try await TruncatingCompressionStrategy().compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: tok, generate: { _ in "" })
+        let usedWords = out.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1, tokenizer: tok) }
+        XCTAssertLessThanOrEqual(usedWords, budget())
+        XCTAssertEqual(out.last?.id, history.last?.id)
+    }
+
     // MARK: - Extractive
 
     func testExtractiveReducesBelowBudgetAndKeepsNewest() async throws {
         let history = overflowingHistory()
         let out = try await ExtractiveCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
 
         XCTAssertLessThanOrEqual(tokens(out), budget())
         XCTAssertEqual(out.last?.id, history.last?.id)
-        XCTAssertEqual(out.map(\.id), out.map(\.id).sorted { a, b in
-            (history.firstIndex { $0.id == a } ?? 0) < (history.firstIndex { $0.id == b } ?? 0)
-        }, "output must stay chronological")
+        // Output must be strictly increasing in original history index — guards
+        // chronological order against re-ordering (not a self-sort tautology).
+        let indices = try out.map { m in try XCTUnwrap(history.firstIndex { $0.id == m.id }) }
+        // Pairwise via zip/dropFirst: safe for 0 or 1 elements, where `1..<count` would trap.
+        for (earlier, later) in zip(indices, indices.dropFirst()) {
+            XCTAssertLessThan(earlier, later, "indices strictly increasing (no dupes/reorder)")
+        }
     }
 
     func testExtractiveSingleMessageNeverEvicted() async throws {
         let history = [msg(.user, words: 5_000)]  // alone but over budget
         let out = try await ExtractiveCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
         XCTAssertEqual(out.count, 1)
     }
 
@@ -115,21 +149,73 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         let oldestID = history.first!.id
 
         let withoutHead = try await ExtractiveCompressionStrategy(headBudgetFraction: 0.0).compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
         let withHead = try await ExtractiveCompressionStrategy(headBudgetFraction: 0.30).compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
 
         // The head knob guarantees the oldest establishing message survives.
         XCTAssertTrue(withHead.contains { $0.id == oldestID }, "head budget must retain the oldest message")
         XCTAssertFalse(withoutHead.contains { $0.id == oldestID }, "without head budget the oldest is evictable")
     }
 
+    /// Finding 2: tail (0.40) + head must not let the verbatim core exceed the
+    /// budget. Pass head 0.6 so head+tail = 1.0 (clamped to 0.8) and confirm
+    /// the result still fits.
+    func testExtractiveVerbatimCoreNeverExceedsBudget() async throws {
+        let history = overflowingHistory(turns: 24, words: 120)
+        let out = try await ExtractiveCompressionStrategy(
+            tailBudgetFraction: 0.40, headBudgetFraction: 0.60  // sums to 1.0 → clamped
+        ).compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
+        XCTAssertLessThanOrEqual(tokens(out), budget(), "verbatim core must be clamped under budget")
+        XCTAssertEqual(out.last?.id, history.last?.id, "newest always survives")
+        XCTAssertFalse(out.isEmpty)
+    }
+
+    /// `headBudgetFraction` at the 1.0 boundary: clamp keeps the union ≤ budget.
+    func testExtractiveHeadFractionAtOneBoundary() async throws {
+        let history = overflowingHistory(turns: 24, words: 120)
+        let out = try await ExtractiveCompressionStrategy(
+            tailBudgetFraction: 0.40, headBudgetFraction: 1.0
+        ).compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
+        XCTAssertLessThanOrEqual(tokens(out), budget())
+        XCTAssertEqual(out.last?.id, history.last?.id)
+    }
+
+    /// All-load-bearing history: every message is `.system`, so all are pinned
+    /// and the over-budget final pass cannot evict them — every input ID survives.
+    func testExtractiveAllMessagesLoadBearing() async throws {
+        let history = (0..<10).map { _ in msg(.system, words: 200) }
+        XCTAssertGreaterThan(tokens(history), budget())
+        let out = try await ExtractiveCompressionStrategy().compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
+        XCTAssertEqual(Set(out.map(\.id)), Set(history.map(\.id)),
+                       "load-bearing records are never evicted even over budget")
+    }
+
+    func testExtractiveWithRealTokenizerReducesBelowBudget() async throws {
+        let tok = WordTokenizer()
+        let history = overflowingHistory(turns: 20, words: 200)
+        let out = try await ExtractiveCompressionStrategy().compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: tok, generate: { _ in "" })
+        let usedWords = out.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1, tokenizer: tok) }
+        XCTAssertLessThanOrEqual(usedWords, budget())
+    }
+
     // MARK: - Anchored
 
     func testAnchoredPrependsMemorySummary() async throws {
         let history = overflowingHistory()
         let out = try await AnchoredCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: Self.echoGenerate)
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: Self.echoGenerate)
 
         let first = try XCTUnwrap(out.first)
         XCTAssertEqual(first.role, .system)
@@ -138,14 +224,17 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         }
         XCTAssertEqual(label, "summary")
         XCTAssertEqual(out.last?.id, history.last?.id, "verbatim tail preserved")
+        // Old messages were dropped and the result fits the budget.
+        XCTAssertLessThan(out.count, history.count, "old messages must be dropped")
+        XCTAssertLessThanOrEqual(tokens(out), budget())
     }
 
     func testAnchoredFallsBackToExtractiveWhenGenerateFails() async throws {
         struct Boom: Error {}
         let history = overflowingHistory()
         let out = try await AnchoredCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil,
-            generate: { _ in throw Boom() })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in throw Boom() })
 
         // Fallback produces a reduced history with NO injected summary record.
         XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false })
@@ -156,19 +245,169 @@ final class DefaultCompressionPolicyTests: XCTestCase {
     func testAnchoredFallsBackOnEmptySummary() async throws {
         let history = overflowingHistory()
         let out = try await AnchoredCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil,
-            generate: { _ in "   " })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "   " })
         XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false })
     }
 
+    /// Empty `generate` (no usable summariser) must fall back, NOT inject an
+    /// empty `.memory` record.
     func testAnchoredWithoutGenerateFallsBack() async throws {
         let history = overflowingHistory()
         let out = try await AnchoredCompressionStrategy().compress(
-            history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" })
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: { _ in "" })
         XCTAssertFalse(out.isEmpty)
+        XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false },
+                       "empty summariser must NOT inject a memory record")
         XCTAssertLessThanOrEqual(tokens(out), budget())
     }
 
+    /// Leaked chain-of-thought must be stripped before parsing — the
+    /// `<think>` scratchpad must not appear in the summary record.
+    func testAnchoredStripsLeakedThinkingFromSummary() async throws {
+        let history = overflowingHistory()
+        let leaky: @Sendable ([ChatMessage]) async throws -> String = { _ in
+            "<think>I should mention SECRET_LEAK while reasoning</think>\nTOPIC: testing\nKEY POINTS: a; b; c"
+        }
+        let out = try await AnchoredCompressionStrategy().compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: leaky)
+        let summary = try XCTUnwrap(out.first)
+        XCTAssertFalse(summary.content.contains("SECRET_LEAK"), "thinking must be stripped")
+        XCTAssertFalse(summary.content.contains("<think>"))
+        XCTAssertTrue(summary.content.contains("TOPIC"), "visible fields survive")
+    }
+
+    /// Chunk-and-fold: oldText exceeds the usable summariser input window, so
+    /// the strategy chunks. `generate` records every prompt; assert ≥2 calls and
+    /// that the OLDEST chunk's marker reaches the brief or a recorded prompt.
+    func testAnchoredChunkAndFold() async throws {
+        actor PromptRecorder {
+            var prompts: [String] = []
+            func record(_ p: String) { prompts.append(p) }
+        }
+        let recorder = PromptRecorder()
+        let history = overflowingHistory(turns: 30, words: 120)
+        // First message carries a unique marker we can trace to the oldest chunk.
+        var tagged = history
+        tagged[0] = ChatMessage(role: .user, content: "OLDEST_MARKER " + history[0].content, sessionID: sessionID)
+
+        let generate: @Sendable ([ChatMessage]) async throws -> String = { msgs in
+            let prompt = msgs.first?.content ?? ""
+            await recorder.record(prompt)
+            // Echo back any marker the chunk contained so it reaches the fold.
+            if prompt.contains("OLDEST_MARKER") {
+                return "TOPIC: oldest\nKEY POINTS: OLDEST_MARKER seen; b; c"
+            }
+            return "TOPIC: chunk\nKEY POINTS: a; b; c"
+        }
+
+        // summarizerInputWindow small enough that old text exceeds the usable budget.
+        let out = try await AnchoredCompressionStrategy(
+            summarizerResponseBuffer: 64, summarizerInputWindow: 600
+        ).compress(
+            history: tagged, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: generate)
+
+        let calls = await recorder.prompts
+        XCTAssertGreaterThanOrEqual(calls.count, 2, "chunking should produce ≥2 generate calls (chunks + fold)")
+        let summary = try XCTUnwrap(out.first)
+        guard case .memory = summary.kind else { return XCTFail("expected memory summary") }
+        XCTAssertTrue(summary.content.contains("OLDEST_MARKER") || calls.contains { $0.contains("OLDEST_MARKER") },
+                      "oldest chunk content must be represented")
+    }
+
+    /// Chunk-failure: the FIRST `generate` call (a chunk) throws. The strategy is
+    /// expected to fall back to that chunk's truncated raw text rather than drop
+    /// it; this test only asserts the observable result: a `.memory` summary is
+    /// still produced and `generate` is called again after the failure.
+    func testAnchoredChunkFailurePreservesContent() async throws {
+        actor CallCounter { var n = 0; func next() -> Int { n += 1; return n } }
+        let counter = CallCounter()
+        let history = overflowingHistory(turns: 30, words: 120)
+        var tagged = history
+        tagged[0] = ChatMessage(role: .user, content: "OLDEST_MARKER " + history[0].content, sessionID: sessionID)
+
+        // Throw on the first generate call (a chunk), succeed on all later calls
+        // (remaining chunks + the fold). The first chunk's raw text falls back
+        // via truncateToFit so its content is not dropped.
+        let generate: @Sendable ([ChatMessage]) async throws -> String = { _ in
+            struct ChunkBoom: Error {}
+            if await counter.next() == 1 { throw ChunkBoom() }
+            return "TOPIC: chunk\nKEY POINTS: a; b; c"
+        }
+        let out = try await AnchoredCompressionStrategy(
+            summarizerResponseBuffer: 64, summarizerInputWindow: 600
+        ).compress(
+            history: tagged, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: generate)
+        // The summary record exists (top-level summarise succeeded on the fold).
+        let summary = try XCTUnwrap(out.first)
+        guard case .memory = summary.kind else { return XCTFail("expected memory summary despite chunk failure") }
+        let totalCalls = await counter.n
+        XCTAssertGreaterThanOrEqual(totalCalls, 2, "should retry remaining chunks + fold after one chunk failed")
+        XCTAssertFalse(out.isEmpty)
+    }
+
+    /// Summary-floor: the tail consumes ~the whole budget, leaving no room for
+    /// the summary. The strategy must still emit a non-empty `.memory` record
+    /// AND keep the result within budget.
+    func testAnchoredSummaryFloor() async throws {
+        // tailBudgetFraction 0.95 → tail eats almost all budget.
+        let history = overflowingHistory(turns: 20, words: 120)
+        let out = try await AnchoredCompressionStrategy(tailBudgetFraction: 0.95).compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: Self.echoGenerate)
+        let summary = try XCTUnwrap(out.first)
+        guard case .memory = summary.kind else { return XCTFail("expected floored memory summary") }
+        XCTAssertFalse(summary.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty,
+                       "summary floor must produce a non-empty brief")
+        XCTAssertLessThanOrEqual(tokens(out), budget())
+    }
+
+    /// Cancellation mid-summarise: cancelling the surrounding Task returns the
+    /// tail with NO summary record.
+    func testAnchoredCancellationMidSummarize() async throws {
+        let history = overflowingHistory()
+        // Hoist instance properties to locals so the Task closure doesn't
+        // capture (non-Sendable) `self`.
+        let ctx = contextSize
+        let reserve = reservedTokens
+        let task = Task { () -> [ChatMessage] in
+            try await AnchoredCompressionStrategy().compress(
+                history: history, contextSize: ctx, reservedTokens: reserve,
+                tokenizer: nil,
+                generate: { _ in
+                    // Suspend (not yield) so cancellation lands before/within summarise.
+                    try await Task.sleep(nanoseconds: 50_000_000)
+                    return "TOPIC: x\nKEY POINTS: a; b"
+                })
+        }
+        task.cancel()
+        let out = try await task.value
+        XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false },
+                       "cancellation must not inject a summary record")
+        XCTAssertEqual(out.last?.id, history.last?.id, "tail preserved on cancel")
+    }
+
+    /// `parseSummaryResponse` <2-field raw-fallback branch: a summary with only
+    /// one recognisable field degrades to the trimmed raw response, not an
+    /// empty/placeholder brief.
+    func testAnchoredSingleFieldRawFallback() async throws {
+        let history = overflowingHistory()
+        let oneField: @Sendable ([ChatMessage]) async throws -> String = { _ in
+            "TOPIC: only one field here and some prose that should survive verbatim"
+        }
+        let out = try await AnchoredCompressionStrategy().compress(
+            history: history, contextSize: contextSize, reservedTokens: reservedTokens,
+            tokenizer: nil, generate: oneField)
+        let summary = try XCTUnwrap(out.first)
+        guard case .memory = summary.kind else { return XCTFail("expected memory summary") }
+        XCTAssertTrue(summary.content.contains("prose that should survive"),
+                      "single-field response degrades to trimmed raw text")
+    }
+
     // MARK: - Policy thresholds & seam agreement
 
     func testShouldCompressHonorsThreshold() {
@@ -179,26 +418,42 @@ final class DefaultCompressionPolicyTests: XCTestCase {
                        "unknown context size never compresses")
     }
 
-    func testPreTurnAndPostTurnTriggersAgree() {
+    /// Trigger boundary, hand-computed. Both seams agree at the integer-token
+    /// boundary and decline just below it; also covers the rounding asymmetry
+    /// where a post-turn caller passing a rounded utilisation fires while the
+    /// pre-turn recompute from raw tokens stays below.
+    func testTriggerAsymmetryBoundary() {
         let threshold = 0.80
         let policy = DefaultCompressionPolicy.truncating(threshold: threshold, contextSize: contextSize)
 
-        // Equivalent inputs: promptTokens = 0.85 * contextSize → both should fire.
-        let promptTokens = Int(0.85 * Double(contextSize))
-        let utilization = Double(promptTokens) / Double(contextSize)
-
-        let postTurn = policy.shouldCompress(
-            promptTokens: promptTokens, contextSize: contextSize, contextUtilization: utilization)
-        let preTurn = policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: promptTokens)
-        XCTAssertEqual(preTurn, postTurn)
-        XCTAssertTrue(preTurn)
-
-        // Below threshold: both decline.
-        let lowTokens = Int(0.50 * Double(contextSize))
-        XCTAssertEqual(
-            policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: lowTokens),
-            policy.shouldCompress(promptTokens: lowTokens, contextSize: contextSize,
-                                  contextUtilization: Double(lowTokens) / Double(contextSize)))
+        // Largest whole-token count at or below threshold × contextSize.
+        let atTokens = Int(threshold * Double(contextSize))  // 0.80 * 2048 = 1638 (1638/2048 = 0.7998…)
+        // At the exact integer-token boundary the recomputed utilisation may be
+        // a hair below threshold; assert both seams agree with the hand-computed
+        // recompute regardless.
+        let atUtil = Double(atTokens) / Double(contextSize)
+        let preAt = policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: atTokens)
+        let postAt = policy.shouldCompress(promptTokens: atTokens, contextSize: contextSize, contextUtilization: atUtil)
+        XCTAssertEqual(preAt, postAt)
+        XCTAssertEqual(preAt, atUtil >= threshold, "pre-turn matches hand-computed bool at boundary")
+
+        // threshold − epsilon: definitively below → both decline.
+        let belowTokens = Int((threshold - 0.01) * Double(contextSize))
+        let belowUtil = Double(belowTokens) / Double(contextSize)
+        XCTAssertFalse(policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: belowTokens))
+        XCTAssertFalse(policy.shouldCompress(promptTokens: belowTokens, contextSize: contextSize, contextUtilization: belowUtil))
+
+        // Rounding asymmetry: a post-turn caller that ROUNDS utilisation up to
+        // the threshold fires, while the pre-turn recompute from raw tokens
+        // (which is just under) does not. Demonstrates the documented seam gap.
+        let justUnderTokens = Int(threshold * Double(contextSize)) - 1  // 1637/2048 = 0.79931 < 0.80
+        let recomputed = Double(justUnderTokens) / Double(contextSize)
+        XCTAssertLessThan(recomputed, threshold, "raw recompute is below threshold")
+        let preJustUnder = policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: justUnderTokens)
+        let postRounded = policy.shouldCompress(promptTokens: justUnderTokens, contextSize: contextSize,
+                                                contextUtilization: threshold)  // caller passes rounded value
+        XCTAssertFalse(preJustUnder, "pre-turn recompute declines just below threshold")
+        XCTAssertTrue(postRounded, "post-turn fires when caller passes a utilisation already at threshold")
     }
 
     func testPreTurnWithoutPriorTokensDoesNotCompress() {
@@ -207,8 +462,11 @@ final class DefaultCompressionPolicyTests: XCTestCase {
     }
 
     func testPolicyCompressDelegatesToStrategy() async throws {
-        let history = overflowingHistory()
-        let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: contextSize)
+        // Context comfortably above the default 2048 reserve so the policy's
+        // small-window guard doesn't skip; history overflows the resulting budget.
+        let largeContext = 8_192
+        let history = overflowingHistory(turns: 80, words: 120)
+        let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: largeContext)
         let out = try await policy.compress(history: history, sessionID: sessionID, generate: Self.echoGenerate)
         XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false })
 
@@ -216,4 +474,29 @@ final class DefaultCompressionPolicyTests: XCTestCase {
         let preOut = try await policy.compressBeforeTurn(history: history, sessionID: sessionID, generate: Self.echoGenerate)
         XCTAssertEqual(preOut.first?.kind.rawStorage, out.first?.kind.rawStorage)
     }
+
+    /// Finding 1 guard: a context window at or below the reservation has no
+    /// usable history budget; the policy must skip compression (return history
+    /// unchanged) rather than churn against a zero/negative budget.
+    func testPolicySkipsWhenContextSmallerThanReserve() async throws {
+        // 512-token simulator cap with the default 2048 reserve.
+        let policy = DefaultCompressionPolicy.truncating(contextSize: 512)
+        let history = overflowingHistory()
+        let out = try await policy.compress(history: history, sessionID: sessionID, generate: { _ in "" })
+        XCTAssertEqual(out.map(\.id), history.map(\.id), "no usable budget → history unchanged")
+    }
+
+    /// The default reserve is clearly larger than the legacy 512.
+    func testDefaultReserveIsLargerThanLegacy() {
+        XCTAssertGreaterThan(DefaultCompressionPolicy.defaultReservedTokens, 512)
+    }
+
+    /// Context-scaled reserve grows with the window but stays under half of it.
+    func testScaledReserveBounds() {
+        let small = DefaultCompressionPolicy.scaledReservedTokens(forContextSize: 4_096)
+        XCTAssertGreaterThanOrEqual(small, DefaultCompressionPolicy.defaultReservedTokens)
+        let large = DefaultCompressionPolicy.scaledReservedTokens(forContextSize: 131_072)
+        XCTAssertEqual(large, 131_072 / 8, "scales to ~12.5% of a big window")
+        XCTAssertLessThanOrEqual(large, 131_072 / 2, "never exceeds half the window")
+    }
 }