From eb504ca12c7b0f453cf9374308d5baf02828a933 Mon Sep 17 00:00:00 2001 From: Rory Ford Date: Mon, 15 Jun 2026 20:15:34 +1000 Subject: [PATCH 1/3] feat(runtime): batteries-included compression policies (truncating / extractive / anchored) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MK shipped the CompressionPolicy / PreTurnCompressionPolicy seams but no concrete strategy — every consumer had to hand-roll compress(). Add three default strategies plus a single policy wrapper that conforms to both seams. - TruncatingCompressionStrategy: zero-inference sliding window (baseline) - ExtractiveCompressionStrategy: zero-inference scored selection (recency / length / keyword-density) with verbatim tail + optional headBudgetFraction knob (middle-out / lost-in-the-middle preservation) - AnchoredCompressionStrategy: summarize old via generate, prepend a .memory("summary") record + keep a verbatim recency tail; chunk-and-fold for over-window input; falls back to extractive on failure/empty/missing-generate - DefaultCompressionPolicy: Sendable struct, conforms to BOTH CompressionPolicy and PreTurnCompressionPolicy; static factories .truncating/.extractive/.anchored Ported and rewritten from Fireside's StoryCompression suite against [ChatMessage]. Co-Authored-By: Claude Opus 4.8 --- .../AnchoredCompressionStrategy.swift | 278 ++++++++++++++++++ .../Compression/CompressionStrategy.swift | 84 ++++++ .../DefaultCompressionPolicy.swift | 136 +++++++++ .../ExtractiveCompressionStrategy.swift | 128 ++++++++ .../TruncatingCompressionStrategy.swift | 48 +++ .../DefaultCompressionPolicyTests.swift | 200 +++++++++++++ 6 files changed, 874 insertions(+) create mode 100644 Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift create mode 100644 Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift create mode 100644 Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift create mode 100644 Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift create mode 100644 Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift create mode 100644 Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift diff --git a/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift new file mode 100644 index 00000000..9aef9270 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift @@ -0,0 +1,278 @@ +import Foundation +import ManifoldInference + +/// Summarises old messages via an inference call, then prepends the summary +/// (as a `.memory("summary")` record) to a verbatim recency tail. Falls back +/// to ``ExtractiveCompressionStrategy`` on any failure — failed or empty +/// inference (including a no-op summariser), or an oversized summary. +/// +/// Adapted from Fireside's `AnchoredStoryCompressor`, preserving its +/// hard-won behaviors: input-window decoupling (`summarizerInputWindow`), +/// chunk-and-fold for over-window input, a minimum-summary floor, and a +/// cancellation early-return. The summary prompt is passed to `generate` as a +/// single-message mini-conversation. +struct AnchoredCompressionStrategy: CompressionStrategy { + let name = "anchored" + + let tailBudgetFraction: Double + /// The summariser model's REAL usable window (tokens), used to size how + /// much old text it may READ. Decoupled from `contextSize`, which is the + /// small overflow *trigger* and the *output-brief* budget. `nil` sizes + /// input against `contextSize` (legacy behavior). + let summarizerInputWindow: Int? + /// Minimum tokens reserved for the output brief even when `contextSize` is + /// tiny — a short brief beats no brief. + let minSummaryBudget: Int + /// Tokens reserved for the summariser's own response when sizing input. + let summarizerResponseBuffer: Int + let summaryTemplate: String + + private let fallback = ExtractiveCompressionStrategy() + + /// Placeholder `{old_text}` is replaced with the concatenated old messages. + static let defaultSummaryTemplate = """ + Summarize the conversation so far. Be concise. Use only what is in the text. + + TOPIC: [main subject of the conversation, brief] + KEY POINTS: [up to 3 important points, semicolon-separated] + OPEN QUESTIONS: [unresolved items or pending decisions, if any] + LAST DISCUSSED: [most recent topic or conclusion, one sentence] + + Conversation: + {old_text} + """ + + init( + tailBudgetFraction: Double = 0.50, + summarizerInputWindow: Int? = nil, + minSummaryBudget: Int = 256, + summarizerResponseBuffer: Int = 512, + summaryTemplate: String? = nil + ) { + self.tailBudgetFraction = tailBudgetFraction + self.summarizerInputWindow = summarizerInputWindow + self.minSummaryBudget = minSummaryBudget + self.summarizerResponseBuffer = summarizerResponseBuffer + self.summaryTemplate = summaryTemplate ?? Self.defaultSummaryTemplate + } + + func compress( + history: [ChatMessage], + contextSize: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + guard !history.isEmpty else { return [] } + + let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer) + let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) } + let originalTokens = tokens.reduce(0, +) + if originalTokens <= budget { + return history + } + + let sessionID = history.first?.sessionID ?? UUID() + let count = history.count + + // --- Verbatim tail: load-bearing records + newest within tailBudget --- + let tailBudget = Int(Double(budget) * tailBudgetFraction) + var tailIndices = Set() + var tailTokens = 0 + for i in 0.. usableInputBudget && usableInputBudget > 0 { + Log.inference.warning( + "[AnchoredCompression] input \(oldTextTokens) tok exceeds usable window \(usableInputBudget); chunk-and-folding \(oldMessages.count) old messages" + ) + prompt = await foldedSummaryPrompt( + oldMessages: oldMessages, chunkBudget: usableInputBudget, + sessionID: sessionID, generate: generate, tokenizer: tokenizer + ) + } else { + prompt = summaryTemplate.replacingOccurrences(of: "{old_text}", with: oldText) + } + + // --- Summarise (cancellation returns a minimal tail-only result) --- + let summaryText: String + do { + try Task.checkCancellation() + summaryText = try await generate([summaryMessage(prompt, sessionID: sessionID)]) + } catch is CancellationError { + return tailMessages + } catch { + Log.inference.debug("[AnchoredCompression] summarisation failed: \(error); falling back to extractive") + return try await fallback.compress( + history: history, contextSize: contextSize, + tokenizer: tokenizer, generate: generate + ) + } + + guard !summaryText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + return try await fallback.compress( + history: history, contextSize: contextSize, + tokenizer: tokenizer, generate: generate + ) + } + + // --- Assemble: summary record + verbatim tail --- + let parsed = parseSummaryResponse(summaryText) + let summaryTokens = ContextWindowManager.estimateTokenCount(parsed, tokenizer: tokenizer) + + var finalSummary = parsed + if summaryTokens + tailTokens > budget { + let rawSummaryBudget = budget - tailTokens + let summaryBudget = rawSummaryBudget > 0 ? rawSummaryBudget : min(minSummaryBudget, budget) + if rawSummaryBudget <= 0 { + Log.inference.warning( + "[AnchoredCompression] tail consumes the whole budget (tail=\(tailTokens) >= budget=\(budget)); emitting floored \(summaryBudget)-tok brief" + ) + } + let truncated = truncateToFit(parsed, budget: summaryBudget, tokenizer: tokenizer) + finalSummary = truncated.isEmpty ? String(parsed.prefix(200)) : truncated + } + + var output: [ChatMessage] = [summaryRecord(finalSummary, sessionID: sessionID)] + output.append(contentsOf: tailMessages) + return output + } + + // MARK: - Helpers + + private func summaryMessage(_ prompt: String, sessionID: UUID) -> ChatMessage { + ChatMessage(role: .user, content: prompt, sessionID: sessionID) + } + + private func summaryRecord(_ text: String, sessionID: UUID) -> ChatMessage { + ChatMessage(role: .system, content: text, sessionID: sessionID, kind: .memory("summary")) + } + + /// Extracts `FIELD: value` lines and reassembles them; falls back to a + /// trimmed raw response when fewer than two fields are present. + private func parseSummaryResponse(_ response: String) -> String { + let pattern = "^([A-Z][A-Z _]*[A-Z]):\\s*(.+)$" + let regex: NSRegularExpression + do { + regex = try NSRegularExpression(pattern: pattern, options: [.anchorsMatchLines, .caseInsensitive]) + } catch { + // The pattern is a compile-time constant; a failure here means a + // toolchain regression, not bad input. Surface it and fall back to + // the trimmed raw response rather than swallowing it silently. + Log.inference.error("[AnchoredCompression] summary regex failed to compile: \(error)") + let trimmed = response.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? "[Summary unavailable]" : String(trimmed.prefix(400)) + } + let ns = response as NSString + var fields: [String] = [] + for match in regex.matches(in: response, range: NSRange(location: 0, length: ns.length)) where match.numberOfRanges >= 3 { + let name = ns.substring(with: match.range(at: 1)) + let value = ns.substring(with: match.range(at: 2)).trimmingCharacters(in: .whitespaces) + if !value.isEmpty { fields.append("\(name): \(value)") } + } + if fields.count >= 2 { return fields.joined(separator: "\n") } + let trimmed = response.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? "[Summary unavailable]" : String(trimmed.prefix(400)) + } + + /// Summarises over-window input in chunks, then folds the chunk summaries + /// into one prompt so the WHOLE old text is covered rather than truncated. + private func foldedSummaryPrompt( + oldMessages: [ChatMessage], + chunkBudget: Int, + sessionID: UUID, + generate: @Sendable ([ChatMessage]) async throws -> String, + tokenizer: (any TokenizerProvider)? + ) async -> String { + let templateOverhead = ContextWindowManager.estimateTokenCount( + summaryTemplate.replacingOccurrences(of: "{old_text}", with: ""), + tokenizer: tokenizer + ) + let perChunkBudget = max(64, chunkBudget - templateOverhead) + + var chunks: [[ChatMessage]] = [] + var current: [ChatMessage] = [] + var currentTokens = 0 + for message in oldMessages { + let cost = estimateTokens(message, tokenizer: tokenizer) + if !current.isEmpty && currentTokens + cost > perChunkBudget { + chunks.append(current) + current = [] + currentTokens = 0 + } + current.append(message) + currentTokens += cost + } + if !current.isEmpty { chunks.append(current) } + + var summaries: [String] = [] + for chunk in chunks { + let chunkText = chunk.map(\.content).joined(separator: "\n\n") + let chunkPrompt = summaryTemplate.replacingOccurrences(of: "{old_text}", with: chunkText) + do { + let summary = try await generate([summaryMessage(chunkPrompt, sessionID: sessionID)]) + if summary.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + summaries.append(truncateToFit(chunkText, budget: perChunkBudget, tokenizer: tokenizer)) + } else { + summaries.append(parseSummaryResponse(summary)) + } + } catch { + // Preserve content even when a chunk's summarisation fails. + Log.inference.debug("[AnchoredCompression] chunk summarisation failed: \(error); keeping raw chunk") + summaries.append(truncateToFit(chunkText, budget: perChunkBudget, tokenizer: tokenizer)) + } + } + return summaryTemplate.replacingOccurrences(of: "{old_text}", with: summaries.joined(separator: "\n\n")) + } + + /// Drops trailing words until the text fits the budget. + private func truncateToFit(_ text: String, budget: Int, tokenizer: (any TokenizerProvider)?) -> String { + if ContextWindowManager.estimateTokenCount(text, tokenizer: tokenizer) <= budget { + return text + } + var words = text.split(separator: " ", omittingEmptySubsequences: true) + while !words.isEmpty { + words.removeLast() + let candidate = words.joined(separator: " ") + if ContextWindowManager.estimateTokenCount(candidate, tokenizer: tokenizer) <= budget { + return candidate + } + } + return "" + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift new file mode 100644 index 00000000..47cc80c0 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift @@ -0,0 +1,84 @@ +import Foundation +import ManifoldInference + +/// A pure, stateless algorithm that reduces a chat history to fit a token +/// budget. Strategies are the reusable core shared by both compression seams +/// (``CompressionPolicy`` post-turn and ``PreTurnCompressionPolicy`` +/// pre-turn); ``DefaultCompressionPolicy`` wraps a strategy with the trigger +/// thresholds each protocol needs. +/// +/// Unlike the ``CompressionPolicy``/``PreTurnCompressionPolicy`` `compress` +/// methods — which receive only `history` + `sessionID` — a strategy also +/// takes the `contextSize` and `tokenizer` it sizes against. The policy holds +/// those as configuration and forwards them, because the protocol `compress` +/// signatures intentionally do not. +/// +/// ## System-prompt budgeting +/// +/// A strategy sees only the `history` array. The session system prompt lives +/// on `ChatSession.systemPrompt` and is *not* part of `history`, so the budget +/// is sized against `contextSize` minus a fixed response buffer. Any `.system` +/// role or `.memory` kind records that *are* in `history` are treated as +/// load-bearing and preserved verbatim by every strategy. +/// +/// Strategies are `Sendable` value types: `generate` is a parameter, never +/// stored, so there is no mutable summariser handle to guard. +protocol CompressionStrategy: Sendable { + /// Stable identifier recorded for diagnostics (e.g. `"truncating"`). + var name: String { get } + + /// Reduce `history` to fit `contextSize`. + /// + /// - Parameters: + /// - history: Full message history, oldest-first. + /// - contextSize: Backend context window in tokens. + /// - tokenizer: Optional tokenizer for cost estimation; heuristic when `nil`. + /// - generate: Inference call for strategies that summarise. Zero-inference + /// strategies ignore it. A closure that yields empty text signals "no + /// usable summariser" — summarising strategies fall back accordingly. + /// - Returns: The replacement history, oldest-first. Never empty when + /// `history` is non-empty. + func compress( + history: [ChatMessage], + contextSize: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] +} + +// MARK: - Shared helpers + +extension CompressionStrategy { + /// Tokens reserved for the model's own response when sizing history. + var responseBuffer: Int { 512 } + + /// Per-message token estimate. + func estimateTokens(_ message: ChatMessage, tokenizer: (any TokenizerProvider)?) -> Int { + ContextWindowManager.estimateTokenCount(message.content, tokenizer: tokenizer) + } + + /// Total tokens across a message set. + func estimateTokens(_ messages: [ChatMessage], tokenizer: (any TokenizerProvider)?) -> Int { + messages.reduce(0) { $0 + estimateTokens($1, tokenizer: tokenizer) } + } + + /// Token budget available for history: context window minus the response + /// buffer. The session system prompt is not visible here (it is not part + /// of `history`), so it is not subtracted. + func historyBudget(contextSize: Int, tokenizer: (any TokenizerProvider)?) -> Int { + max(0, contextSize - responseBuffer) + } + + /// `true` for records that must survive compression regardless of budget: + /// `.system`-role prompt fragments and `.memory` summarisation artifacts + /// (a prior compression brief must not be evicted by a later pass). + /// + // TODO: per-message pinning is not visible to a `[ChatMessage]`-only + // policy — `ChatSession.pinnedMessageIDs` lives on the session. When the + // seam grows a pinned-IDs channel, honor it here. + func isLoadBearing(_ message: ChatMessage) -> Bool { + if message.role == .system { return true } + if case .memory = message.kind { return true } + return false + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift new file mode 100644 index 00000000..a295a8ac --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift @@ -0,0 +1,136 @@ +import Foundation +import ManifoldInference + +/// Batteries-included compression policy. Pairs a ``CompressionStrategy`` with +/// a utilisation threshold and conforms to **both** compression seams — +/// ``CompressionPolicy`` (post-turn) and ``PreTurnCompressionPolicy`` +/// (pre-turn) — so one value drives either, or both, without a bespoke +/// conformance. +/// +/// Construct via the strategy factories rather than the initializer: +/// +/// ```swift +/// // Zero-inference, smart selection. Compress at 75% context utilisation. +/// runtime.compressionPolicy = .extractive(threshold: 0.75, contextSize: 8_192) +/// +/// // Inference-backed summary of old turns, kept-recent tail. Pre-turn seam. +/// runtime.preTurnCompressionPolicy = .anchored(threshold: 0.85, contextSize: 8_192) +/// +/// // Cheapest baseline: drop oldest, keep system/summary records + newest. +/// runtime.compressionPolicy = .truncating(threshold: 0.90, contextSize: 4_096) +/// ``` +/// +/// ## Trigger asymmetry +/// +/// ``CompressionPolicy/shouldCompress(promptTokens:contextSize:contextUtilization:)`` +/// receives utilisation directly. ``PreTurnCompressionPolicy`` does not — its +/// `shouldCompressBeforeTurn` sees only `messageCount` and `lastPromptTokens`, +/// so this policy stores `contextSize` and computes utilisation from +/// `lastPromptTokens / contextSize`. Given equivalent inputs the two seams +/// agree. +public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPolicy { + private let strategy: any CompressionStrategy + + /// Context-utilisation ratio at or above which compression triggers. + public let threshold: Double + /// Backend context window (tokens). Held as configuration because the + /// pre-turn seam does not pass it and the strategy needs it to size budget. + public let contextSize: Int + private let tokenizer: (any TokenizerProvider)? + + init( + strategy: any CompressionStrategy, + threshold: Double, + contextSize: Int, + tokenizer: (any TokenizerProvider)? + ) { + self.strategy = strategy + self.threshold = threshold + self.contextSize = contextSize + self.tokenizer = tokenizer + } + + // MARK: - Factories + + /// Zero-inference sliding window: keep system/summary records + the newest + /// messages that fit, drop the oldest. + public static func truncating( + threshold: Double = 0.90, + contextSize: Int, + tokenizer: (any TokenizerProvider)? = nil + ) -> DefaultCompressionPolicy { + DefaultCompressionPolicy( + strategy: TruncatingCompressionStrategy(), + threshold: threshold, contextSize: contextSize, tokenizer: tokenizer + ) + } + + /// Zero-inference scored selection (recency / length / keyword density). + /// `headBudgetFraction > 0` pins the oldest messages to counter the + /// "lost in the middle" effect. + public static func extractive( + threshold: Double = 0.75, + headBudgetFraction: Double = 0.0, + contextSize: Int, + tokenizer: (any TokenizerProvider)? = nil + ) -> DefaultCompressionPolicy { + DefaultCompressionPolicy( + strategy: ExtractiveCompressionStrategy(headBudgetFraction: headBudgetFraction), + threshold: threshold, contextSize: contextSize, tokenizer: tokenizer + ) + } + + /// Inference-backed summary of old turns prepended to a verbatim recent + /// tail. Falls back to extractive when no summary can be produced. + /// + /// - Parameter summarizerInputWindow: the summariser's REAL window, used to + /// size how much old text it reads — set this to the backend's true + /// context size when `contextSize` is a small overflow trigger. + public static func anchored( + threshold: Double = 0.85, + contextSize: Int, + summarizerInputWindow: Int? = nil, + summaryTemplate: String? = nil, + tokenizer: (any TokenizerProvider)? = nil + ) -> DefaultCompressionPolicy { + DefaultCompressionPolicy( + strategy: AnchoredCompressionStrategy( + summarizerInputWindow: summarizerInputWindow, + summaryTemplate: summaryTemplate + ), + threshold: threshold, contextSize: contextSize, tokenizer: tokenizer + ) + } + + // MARK: - CompressionPolicy (post-turn) + + public func shouldCompress(promptTokens: Int, contextSize: Int, contextUtilization: Double) -> Bool { + contextSize > 0 && contextUtilization >= threshold + } + + public func compress( + history: [ChatMessage], + sessionID: UUID, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + try await strategy.compress( + history: history, contextSize: contextSize, + tokenizer: tokenizer, generate: generate + ) + } + + // MARK: - PreTurnCompressionPolicy (pre-turn) + + public func shouldCompressBeforeTurn(messageCount: Int, lastPromptTokens: Int?) -> Bool { + guard contextSize > 0, let promptTokens = lastPromptTokens else { return false } + return Double(promptTokens) / Double(contextSize) >= threshold + } + + public func compressBeforeTurn( + history: [ChatMessage], + sessionID: UUID, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + try await compress(history: history, sessionID: sessionID, generate: generate) + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift new file mode 100644 index 00000000..8332a5c0 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift @@ -0,0 +1,128 @@ +import Foundation +import ManifoldInference + +/// Zero-inference scored selection. Reserves a verbatim recency tail (and, +/// optionally, a verbatim head for establishing context), then greedily fills +/// the remaining budget with the highest-scoring older messages. Score blends +/// recency, content length, and capitalized-word density (a cheap proper-noun +/// proxy). +/// +/// Adapted from Fireside's `ExtractiveStoryCompressor`. The `headBudgetFraction` +/// knob is new: pinning the oldest messages counters the "lost in the middle" +/// effect, where models attend most to the start and end of context. +struct ExtractiveCompressionStrategy: CompressionStrategy { + let name = "extractive" + + /// Fraction of the budget reserved for the verbatim recency tail. + let tailBudgetFraction: Double + /// Fraction of the budget reserved for a verbatim head (oldest messages). + /// `0` disables head preservation (Fireside's original behavior). + let headBudgetFraction: Double + let recencyWeight: Double + let lengthWeight: Double + let keywordDensityWeight: Double + + init( + tailBudgetFraction: Double = 0.40, + headBudgetFraction: Double = 0.0, + recencyWeight: Double = 0.5, + lengthWeight: Double = 0.3, + keywordDensityWeight: Double = 0.2 + ) { + self.tailBudgetFraction = tailBudgetFraction + self.headBudgetFraction = headBudgetFraction + self.recencyWeight = recencyWeight + self.lengthWeight = lengthWeight + self.keywordDensityWeight = keywordDensityWeight + } + + func compress( + history: [ChatMessage], + contextSize: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + guard !history.isEmpty else { return [] } + + let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer) + let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) } + let originalTokens = tokens.reduce(0, +) + + // Everything fits, or a single message: never evict all history. + if originalTokens <= budget || history.count == 1 { + return history + } + + let count = history.count + var keep = Set() + var used = 0 + + // Load-bearing records are always kept. + for i in 0..= tailBudget { break } + } + // Always preserve the newest message. + if !keep.contains(count - 1) { + keep.insert(count - 1) + used += tokens[count - 1] + } + + // --- Verbatim head (oldest) — anti "lost in the middle" --- + if headBudgetFraction > 0 { + let headBudget = Int(Double(budget) * headBudgetFraction) + var headUsed = 0 + for i in 0..= headBudget { break } + } + } + + // --- Score and greedily select the remainder --- + struct Scored { let index: Int; let score: Double; let tokens: Int } + var candidates: [Scored] = [] + for i in 0.. 1 ? Double(i) / Double(count - 1) : 1.0 + let length = min(1.0, Double(tokens[i]) / 200.0) + let density = keywordDensity(of: history[i].content) + let score = recency * recencyWeight + length * lengthWeight + density * keywordDensityWeight + candidates.append(Scored(index: i, score: score, tokens: tokens[i])) + } + candidates.sort { $0.score > $1.score } + + for candidate in candidates { + if used + candidate.tokens <= budget { + keep.insert(candidate.index) + used += candidate.tokens + } + } + + return keep.sorted().map { history[$0] } + } + + /// Ratio of capitalized words to total words — a rough proper-noun proxy. + private func keywordDensity(of content: String) -> Double { + let words = content.split(whereSeparator: { $0.isWhitespace }) + guard !words.isEmpty else { return 0 } + let capitalized = words.filter { $0.first?.isUppercase == true }.count + return Double(capitalized) / Double(words.count) + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift new file mode 100644 index 00000000..467d4cf5 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift @@ -0,0 +1,48 @@ +import Foundation +import ManifoldInference + +/// Zero-inference sliding window. Keeps every load-bearing record +/// (`.system` / `.memory`) plus the newest messages that fit the budget, +/// dropping the oldest. The cheapest possible strategy and a safe default when +/// no summariser is available. +struct TruncatingCompressionStrategy: CompressionStrategy { + let name = "truncating" + + func compress( + history: [ChatMessage], + contextSize: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + guard !history.isEmpty else { return [] } + + let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer) + if estimateTokens(history, tokenizer: tokenizer) <= budget { + return history + } + + var kept = Set() + var used = 0 + + // Load-bearing records are always retained. + for (i, message) in history.enumerated() where isLoadBearing(message) { + kept.insert(i) + used += estimateTokens(message, tokenizer: tokenizer) + } + + // Fill remaining budget with the newest non-kept messages. + for i in stride(from: history.count - 1, through: 0, by: -1) { + if kept.contains(i) { continue } + let cost = estimateTokens(history[i], tokenizer: tokenizer) + if kept.isEmpty || used + cost <= budget { + kept.insert(i) + used += cost + } + } + + // Invariant: never drop the newest message. + kept.insert(history.count - 1) + + return kept.sorted().map { history[$0] } + } +} diff --git a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift new file mode 100644 index 00000000..31f603cc --- /dev/null +++ b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift @@ -0,0 +1,200 @@ +@preconcurrency import XCTest +import Foundation +@testable import ManifoldRuntime +@testable import ManifoldInference + +/// Unit coverage for the batteries-included compression strategies and the +/// ``DefaultCompressionPolicy`` wrapper. Pure value transforms — no SwiftData, +/// no runtime wiring (that lives in `CompressionPolicyTests`). +final class DefaultCompressionPolicyTests: XCTestCase { + + // MARK: - Fixtures + + private let sessionID = UUID() + + /// Builds a message whose token cost scales with `words`. + private func msg(_ role: MessageRole, words: Int, kind: MessageKind = .chat) -> ChatMessage { + let content = Array(repeating: "lorem", count: words).joined(separator: " ") + return ChatMessage(role: role, content: content, sessionID: sessionID, kind: kind) + } + + /// Long alternating conversation that overflows `contextSize`. + private func overflowingHistory(turns: Int = 12, words: Int = 120) -> [ChatMessage] { + (0.. Int { max(0, contextSize - 512) } + private func tokens(_ messages: [ChatMessage]) -> Int { + messages.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1.content, tokenizer: nil) } + } + + private static func echoGenerate(_: [ChatMessage]) async throws -> String { + "TOPIC: testing\nKEY POINTS: a; b; c\nLAST DISCUSSED: the end" + } + + // MARK: - Truncating + + func testTruncatingLeavesSmallHistoryUntouched() async throws { + let history = [msg(.user, words: 5), msg(.assistant, words: 5)] + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + XCTAssertEqual(out.map(\.id), history.map(\.id)) + } + + func testTruncatingDropsOldestAndKeepsNewest() async throws { + let history = overflowingHistory() + XCTAssertGreaterThan(tokens(history), budget()) // precondition: actually overflows + + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + + XCTAssertLessThan(out.count, history.count, "expected eviction") + XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive") + XCTAssertFalse(out.contains { $0.id == history.first?.id }, "oldest should be dropped") + XCTAssertLessThanOrEqual(tokens(out), budget()) + } + + func testTruncatingPreservesLoadBearingRecords() async throws { + var history = [msg(.system, words: 10, kind: .chat)] // system role + history.append(msg(.assistant, words: 10, kind: .memory("summary"))) // memory kind + history.append(contentsOf: overflowingHistory()) + + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + + XCTAssertTrue(out.contains { $0.role == .system }, "system prompt must survive") + XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false }, + "prior summary must survive") + } + + // MARK: - Extractive + + func testExtractiveReducesBelowBudgetAndKeepsNewest() async throws { + let history = overflowingHistory() + let out = try await ExtractiveCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + + XCTAssertLessThanOrEqual(tokens(out), budget()) + XCTAssertEqual(out.last?.id, history.last?.id) + XCTAssertEqual(out.map(\.id), out.map(\.id).sorted { a, b in + (history.firstIndex { $0.id == a } ?? 0) < (history.firstIndex { $0.id == b } ?? 0) + }, "output must stay chronological") + } + + func testExtractiveSingleMessageNeverEvicted() async throws { + let history = [msg(.user, words: 5_000)] // alone but over budget + let out = try await ExtractiveCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + XCTAssertEqual(out.count, 1) + } + + func testExtractiveHeadBudgetPreservesOldest() async throws { + let history = overflowingHistory(turns: 16, words: 120) + let oldestID = history.first!.id + + let withoutHead = try await ExtractiveCompressionStrategy(headBudgetFraction: 0.0).compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + let withHead = try await ExtractiveCompressionStrategy(headBudgetFraction: 0.30).compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + + // The head knob guarantees the oldest establishing message survives. + XCTAssertTrue(withHead.contains { $0.id == oldestID }, "head budget must retain the oldest message") + XCTAssertFalse(withoutHead.contains { $0.id == oldestID }, "without head budget the oldest is evictable") + } + + // MARK: - Anchored + + func testAnchoredPrependsMemorySummary() async throws { + let history = overflowingHistory() + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: Self.echoGenerate) + + let first = try XCTUnwrap(out.first) + XCTAssertEqual(first.role, .system) + guard case .memory(let label) = first.kind else { + return XCTFail("first record must be a .memory summary") + } + XCTAssertEqual(label, "summary") + XCTAssertEqual(out.last?.id, history.last?.id, "verbatim tail preserved") + } + + func testAnchoredFallsBackToExtractiveWhenGenerateFails() async throws { + struct Boom: Error {} + let history = overflowingHistory() + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, + generate: { _ in throw Boom() }) + + // Fallback produces a reduced history with NO injected summary record. + XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false }) + XCTAssertLessThanOrEqual(tokens(out), budget()) + XCTAssertFalse(out.isEmpty) + } + + func testAnchoredFallsBackOnEmptySummary() async throws { + let history = overflowingHistory() + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, + generate: { _ in " " }) + XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false }) + } + + func testAnchoredWithoutGenerateFallsBack() async throws { + let history = overflowingHistory() + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + XCTAssertFalse(out.isEmpty) + XCTAssertLessThanOrEqual(tokens(out), budget()) + } + + // MARK: - Policy thresholds & seam agreement + + func testShouldCompressHonorsThreshold() { + let policy = DefaultCompressionPolicy.extractive(threshold: 0.75, contextSize: contextSize) + XCTAssertTrue(policy.shouldCompress(promptTokens: 0, contextSize: contextSize, contextUtilization: 0.80)) + XCTAssertFalse(policy.shouldCompress(promptTokens: 0, contextSize: contextSize, contextUtilization: 0.50)) + XCTAssertFalse(policy.shouldCompress(promptTokens: 0, contextSize: 0, contextUtilization: 0.99), + "unknown context size never compresses") + } + + func testPreTurnAndPostTurnTriggersAgree() { + let threshold = 0.80 + let policy = DefaultCompressionPolicy.truncating(threshold: threshold, contextSize: contextSize) + + // Equivalent inputs: promptTokens = 0.85 * contextSize → both should fire. + let promptTokens = Int(0.85 * Double(contextSize)) + let utilization = Double(promptTokens) / Double(contextSize) + + let postTurn = policy.shouldCompress( + promptTokens: promptTokens, contextSize: contextSize, contextUtilization: utilization) + let preTurn = policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: promptTokens) + XCTAssertEqual(preTurn, postTurn) + XCTAssertTrue(preTurn) + + // Below threshold: both decline. + let lowTokens = Int(0.50 * Double(contextSize)) + XCTAssertEqual( + policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: lowTokens), + policy.shouldCompress(promptTokens: lowTokens, contextSize: contextSize, + contextUtilization: Double(lowTokens) / Double(contextSize))) + } + + func testPreTurnWithoutPriorTokensDoesNotCompress() { + let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: contextSize) + XCTAssertFalse(policy.shouldCompressBeforeTurn(messageCount: 500, lastPromptTokens: nil)) + } + + func testPolicyCompressDelegatesToStrategy() async throws { + let history = overflowingHistory() + let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: contextSize) + let out = try await policy.compress(history: history, sessionID: sessionID, generate: Self.echoGenerate) + XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false }) + + // compressBeforeTurn shares the same path. + let preOut = try await policy.compressBeforeTurn(history: history, sessionID: sessionID, generate: Self.echoGenerate) + XCTAssertEqual(preOut.first?.kind.rawStorage, out.first?.kind.rawStorage) + } +} From 477830baa56c444fc4a9e82605eb0275eef703e9 Mon Sep 17 00:00:00 2001 From: Rory Ford Date: Mon, 15 Jun 2026 20:45:30 +1000 Subject: [PATCH 2/3] fix(runtime): review fixes for compression policies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a truncating-strategy test that guards the never-drop-newest invariant under load-bearing-overflow — the existing tests passed even with that line removed (the greedy backward fill already keeps the newest in the common case), so the invariant was effectively untested. Sabotage-verified. Co-Authored-By: Claude Opus 4.8 --- .../DefaultCompressionPolicyTests.swift | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift index 31f603cc..cc9aaf3f 100644 --- a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift +++ b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift @@ -57,6 +57,25 @@ final class DefaultCompressionPolicyTests: XCTestCase { XCTAssertLessThanOrEqual(tokens(out), budget()) } + /// When load-bearing records alone exhaust the budget, the greedy backward + /// fill cannot admit the newest chat message — only the explicit + /// never-drop-newest invariant keeps it. Guards that invariant directly + /// (the over-budget-tail path the other truncating tests don't exercise). + func testTruncatingKeepsNewestEvenWhenLoadBearingExceedsBudget() async throws { + let history = [ + msg(.system, words: 4_000, kind: .chat), // load-bearing, alone over budget + msg(.user, words: 5), + msg(.assistant, words: 5) // newest, tiny + ] + XCTAssertGreaterThan(tokens([history[0]]), budget(), "precondition: load-bearing alone overflows") + + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + + XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive even when load-bearing fills the budget") + XCTAssertTrue(out.contains { $0.role == .system }, "load-bearing record retained") + } + func testTruncatingPreservesLoadBearingRecords() async throws { var history = [msg(.system, words: 10, kind: .chat)] // system role history.append(msg(.assistant, words: 10, kind: .memory("summary"))) // memory kind From 30116489488d7b803e07fc394dd2615b2d71f98c Mon Sep 17 00:00:00 2001 From: Rory Ford Date: Mon, 15 Jun 2026 21:24:22 +1000 Subject: [PATCH 3/3] fix(runtime): address review findings on compression policies Budget realism (configurable reservedTokens replacing the bare 512, injectable tokenizer on the factories, skip-on-tiny-window guard), extractive verbatim-core overflow clamp, thinking-model robustness (strip before summary parse, configurable response reserve), multimodal/tool per-part token accounting in ContextWindowManager, plus the QA test set (asymmetry boundary, chunk-and-fold, summary-floor, cancellation, tightened weak assertions) and doc fixes. Co-Authored-By: Claude Opus 4.8 --- .../Services/ContextWindowManager.swift | 59 ++- .../Protocols/CompressionPolicy.swift | 11 + .../AnchoredCompressionStrategy.swift | 52 ++- .../Compression/CompressionStrategy.swift | 59 ++- .../DefaultCompressionPolicy.swift | 125 +++++- .../ExtractiveCompressionStrategy.swift | 46 ++- .../TruncatingCompressionStrategy.swift | 3 +- .../ContextWindowManagerTests.swift | 62 +++ .../DefaultCompressionPolicyTests.swift | 363 ++++++++++++++++-- 9 files changed, 691 insertions(+), 89 deletions(-) diff --git a/Sources/ManifoldInference/Services/ContextWindowManager.swift b/Sources/ManifoldInference/Services/ContextWindowManager.swift index c70aa298..c04a09e6 100644 --- a/Sources/ManifoldInference/Services/ContextWindowManager.swift +++ b/Sources/ManifoldInference/Services/ContextWindowManager.swift @@ -71,6 +71,60 @@ public enum ContextWindowManager { return HeuristicTokenizer().tokenCount(text) } + /// Fixed per-part token cost for a non-text modality whose true token + /// footprint isn't visible from the `[ChatMessage]` value alone. + /// + /// `.content` only exposes `.text` parts, so an image/audio part would + /// otherwise estimate as **zero** — a multimodal turn could then sail past + /// the compression trigger and overflow at generation. These are coarse + /// placeholders, not provider-accurate counts (a single image is ~85–1.5k + /// tokens depending on resolution / provider), deliberately sized to be + /// "clearly non-zero, never catastrophically wrong": they make the budget + /// math notice the part exists. A backend that knows its real tiling cost + /// should account for that at the wire layer, not here. + static let imagePartTokenEstimate = 768 + static let audioPartTokenEstimate = 384 + static let generatedMediaPartTokenEstimate = 256 + + /// Estimates the token cost of a single `MessagePart`. + /// + /// - `.text` / `.thinking`: counted via the tokenizer (heuristic when nil). + /// - `.image` / `.audio` / `.generatedMedia`: a documented fixed estimate + /// (``imagePartTokenEstimate`` etc.) since the byte payload's true token + /// footprint isn't derivable here. + /// - `.toolCall`: the tool name plus its serialized JSON arguments. + /// - `.toolResult`: the serialized result payload (content + dialog). + public static func estimateTokenCount(_ part: MessagePart, tokenizer: TokenizerProvider? = nil) -> Int { + switch part { + case .text(let t): + return estimateTokenCount(t, tokenizer: tokenizer) + case .thinking(let t, _): + return estimateTokenCount(t, tokenizer: tokenizer) + case .image: + return imagePartTokenEstimate + case .audio: + return audioPartTokenEstimate + case .generatedMedia: + return generatedMediaPartTokenEstimate + case .toolCall(let call): + return estimateTokenCount("\(call.toolName) \(call.arguments)", tokenizer: tokenizer) + case .toolResult(let result): + let payload = [result.content, result.dialog].compactMap { $0 }.joined(separator: " ") + return estimateTokenCount(payload, tokenizer: tokenizer) + } + } + + /// Estimates the token cost of a whole message by summing every content + /// part — text, reasoning, multimodal, and tool parts alike. + /// + /// Prefer this over `estimateTokenCount(message.content, …)`: `.content` + /// discards everything but `.text` parts, so the string overload silently + /// under-counts (to zero) image/audio/tool-only messages and lets them + /// overflow the window. Sums across `contentParts` instead. + public static func estimateTokenCount(_ message: ChatMessage, tokenizer: TokenizerProvider? = nil) -> Int { + message.contentParts.reduce(0) { $0 + estimateTokenCount($1, tokenizer: tokenizer) } + } + /// Resolves the effective context size from available sources. /// /// Priority: session override > model metadata > backend capabilities > default. @@ -121,7 +175,8 @@ public enum ContextWindowManager { var usedTokens = 0 for i in stride(from: messages.count - 1, through: 0, by: -1) { - let messageTokens = estimateTokenCount(messages[i].content, tokenizer: tokenizer) + // Sum across all parts — `.content` would miss image/audio/tool parts. + let messageTokens = estimateTokenCount(messages[i], tokenizer: tokenizer) if usedTokens + messageTokens > available && firstKeptIndex < messages.endIndex { break } @@ -141,7 +196,7 @@ public enum ContextWindowManager { tokenizer: TokenizerProvider? = nil ) -> ContextBudget { let systemTokens = estimateTokenCount(systemPrompt ?? "", tokenizer: tokenizer) - let messageTokens = messages.reduce(0) { $0 + estimateTokenCount($1.content, tokenizer: tokenizer) } + let messageTokens = messages.reduce(0) { $0 + estimateTokenCount($1, tokenizer: tokenizer) } let availableForHistory = maxTokens - systemTokens - responseBuffer return ContextBudget( diff --git a/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift b/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift index cdd303c6..314a9212 100644 --- a/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift +++ b/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift @@ -11,6 +11,17 @@ import ManifoldInference /// Compression failures are logged and do not abort the turn — the existing /// history is preserved. /// +/// ## Per-message pins are not yet threaded through this seam +/// +/// `compress(history:sessionID:generate:)` passes only a `[ChatMessage]` and +/// the `sessionID` — **not** the set of user-pinned message IDs. The data +/// already exists (`ChatSession.pinnedMessageIDsRaw` / `pinnedMessageIDs` on +/// the session record), but honoring pins inside a policy requires a +/// protocol-signature change to carry the pinned-ID set (a new parameter or a +/// session handle). Until that lands, ``DefaultCompressionPolicy`` treats only +/// `.system`-role and `.memory`-kind records as load-bearing; explicit +/// per-message pins are not preserved across compression. +/// /// ## v0.26.0 Migration /// /// The `shouldCompress` signature gained a `contextUtilization` parameter in v0.26.0. diff --git a/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift index 9aef9270..470232ec 100644 --- a/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift +++ b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift @@ -11,6 +11,16 @@ import ManifoldInference /// chunk-and-fold for over-window input, a minimum-summary floor, and a /// cancellation early-return. The summary prompt is passed to `generate` as a /// single-message mini-conversation. +/// +// TODO(#1885, optional P2): collapse/fold a prior `.memory("summary")` record +// into the new summarisation pass instead of pinning it (as load-bearing) into +// the tail and prepending a SECOND inline summary each cycle. Across many +// compression cycles the inline summary blocks stack. Doing this right means +// detecting the prior summary, excluding it from the verbatim tail, and +// feeding its text into the summariser input alongside the old messages — a +// non-trivial change to the tail/old-message partition. Deliberately deferred +// to keep this fix-round diff bounded; the stack is bounded in practice by the +// budget enforcement and is correctness-neutral (just less compact). struct AnchoredCompressionStrategy: CompressionStrategy { let name = "anchored" @@ -23,7 +33,10 @@ struct AnchoredCompressionStrategy: CompressionStrategy { /// Minimum tokens reserved for the output brief even when `contextSize` is /// tiny — a short brief beats no brief. let minSummaryBudget: Int - /// Tokens reserved for the summariser's own response when sizing input. + /// Tokens reserved for the summariser's own response when sizing the INPUT + /// window. This is the SAME reservation knob as the policy's + /// `reservedTokens` (the factory threads `reservedTokens` here), so there is + /// one source of truth — there is no longer a separate hard-coded buffer. let summarizerResponseBuffer: Int let summaryTemplate: String @@ -44,9 +57,9 @@ struct AnchoredCompressionStrategy: CompressionStrategy { init( tailBudgetFraction: Double = 0.50, + summarizerResponseBuffer: Int = DefaultCompressionPolicy.defaultReservedTokens, summarizerInputWindow: Int? = nil, minSummaryBudget: Int = 256, - summarizerResponseBuffer: Int = 512, summaryTemplate: String? = nil ) { self.tailBudgetFraction = tailBudgetFraction @@ -59,12 +72,13 @@ struct AnchoredCompressionStrategy: CompressionStrategy { func compress( history: [ChatMessage], contextSize: Int, + reservedTokens: Int, tokenizer: (any TokenizerProvider)?, generate: @Sendable ([ChatMessage]) async throws -> String ) async throws -> [ChatMessage] { guard !history.isEmpty else { return [] } - let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer) + let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens) let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) } let originalTokens = tokens.reduce(0, +) if originalTokens <= budget { @@ -140,14 +154,14 @@ struct AnchoredCompressionStrategy: CompressionStrategy { Log.inference.debug("[AnchoredCompression] summarisation failed: \(error); falling back to extractive") return try await fallback.compress( history: history, contextSize: contextSize, - tokenizer: tokenizer, generate: generate + reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate ) } guard !summaryText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { return try await fallback.compress( history: history, contextSize: contextSize, - tokenizer: tokenizer, generate: generate + reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate ) } @@ -183,9 +197,33 @@ struct AnchoredCompressionStrategy: CompressionStrategy { ChatMessage(role: .system, content: text, sessionID: sessionID, kind: .memory("summary")) } + /// Strips leaked chain-of-thought (``, `…`) + /// before parsing, reusing MK's ``ThinkingTransform`` rather than + /// hand-rolling marker logic. A reasoning model can emit its scratchpad + /// ahead of the brief; left in, that scratchpad would be parsed as the + /// "summary" and injected verbatim into the compressed history. We run the + /// two common marker families (Qwen/DeepSeek ``, Mistral/Sky-T1 + /// ``) in sequence and keep only the visible `.token` text. + private func stripThinking(_ text: String) -> String { + var result = text + for markers in [ThinkingMarkers.qwen3, ThinkingMarkers.mistralReasoning] { + var transform = ThinkingTransform(markers: markers) + var events = transform.process([.token(result)]) + events += transform.finalize() + let visible = events.compactMap { event -> String? in + if case .token(let t) = event { return t } + return nil + }.joined() + result = visible + } + return result + } + /// Extracts `FIELD: value` lines and reassembles them; falls back to a - /// trimmed raw response when fewer than two fields are present. - private func parseSummaryResponse(_ response: String) -> String { + /// trimmed raw response when fewer than two fields are present. Strips + /// leaked reasoning first so chain-of-thought can't masquerade as a summary. + private func parseSummaryResponse(_ rawResponse: String) -> String { + let response = stripThinking(rawResponse) let pattern = "^([A-Z][A-Z _]*[A-Z]):\\s*(.+)$" let regex: NSRegularExpression do { diff --git a/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift index 47cc80c0..ab99e8a3 100644 --- a/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift +++ b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift @@ -13,13 +13,19 @@ import ManifoldInference /// those as configuration and forwards them, because the protocol `compress` /// signatures intentionally do not. /// -/// ## System-prompt budgeting +/// ## System-prompt budgeting (seam constraint) /// /// A strategy sees only the `history` array. The session system prompt lives -/// on `ChatSession.systemPrompt` and is *not* part of `history`, so the budget -/// is sized against `contextSize` minus a fixed response buffer. Any `.system` -/// role or `.memory` kind records that *are* in `history` are treated as -/// load-bearing and preserved verbatim by every strategy. +/// on `ChatSession.systemPrompt` and is *not* part of `history`, and the +/// `CompressionPolicy` / `PreTurnCompressionPolicy` protocol `compress` +/// signatures pass **neither** a system prompt nor a tokenizer — so a strategy +/// cannot subtract a *real* system-prompt token count and cannot read a live +/// tokenizer. Both are therefore **configuration** on ``DefaultCompressionPolicy`` +/// (set via the factories) and forwarded in: `reservedTokens` (response +/// headroom + a system-prompt allowance) and an optional `tokenizer`. The +/// budget is `contextSize − reservedTokens`. Any `.system`-role or `.memory`-kind +/// records that *are* in `history` are treated as load-bearing and preserved +/// verbatim by every strategy. /// /// Strategies are `Sendable` value types: `generate` is a parameter, never /// stored, so there is no mutable summariser handle to guard. @@ -32,7 +38,13 @@ protocol CompressionStrategy: Sendable { /// - Parameters: /// - history: Full message history, oldest-first. /// - contextSize: Backend context window in tokens. - /// - tokenizer: Optional tokenizer for cost estimation; heuristic when `nil`. + /// - reservedTokens: Tokens carved out of `contextSize` before history is + /// sized — response headroom **plus** a system-prompt allowance (the + /// real prompt isn't visible here). Single source of truth for the + /// reservation across all strategies. + /// - tokenizer: Optional tokenizer for cost estimation; heuristic + /// (chars/4) when `nil`, in which case the budget is **advisory**, not + /// guaranteed to match the backend's real token count. /// - generate: Inference call for strategies that summarise. Zero-inference /// strategies ignore it. A closure that yields empty text signals "no /// usable summariser" — summarising strategies fall back accordingly. @@ -41,6 +53,7 @@ protocol CompressionStrategy: Sendable { func compress( history: [ChatMessage], contextSize: Int, + reservedTokens: Int, tokenizer: (any TokenizerProvider)?, generate: @Sendable ([ChatMessage]) async throws -> String ) async throws -> [ChatMessage] @@ -49,12 +62,11 @@ protocol CompressionStrategy: Sendable { // MARK: - Shared helpers extension CompressionStrategy { - /// Tokens reserved for the model's own response when sizing history. - var responseBuffer: Int { 512 } - - /// Per-message token estimate. + /// Per-message token estimate — sums across ALL content parts (text, + /// reasoning, image/audio, tool-call/-result), not just `.text`, so a + /// multimodal or tool-only message isn't counted as zero (#1885 finding 9). func estimateTokens(_ message: ChatMessage, tokenizer: (any TokenizerProvider)?) -> Int { - ContextWindowManager.estimateTokenCount(message.content, tokenizer: tokenizer) + ContextWindowManager.estimateTokenCount(message, tokenizer: tokenizer) } /// Total tokens across a message set. @@ -62,20 +74,29 @@ extension CompressionStrategy { messages.reduce(0) { $0 + estimateTokens($1, tokenizer: tokenizer) } } - /// Token budget available for history: context window minus the response - /// buffer. The session system prompt is not visible here (it is not part - /// of `history`), so it is not subtracted. - func historyBudget(contextSize: Int, tokenizer: (any TokenizerProvider)?) -> Int { - max(0, contextSize - responseBuffer) + /// Token budget available for history: context window minus `reservedTokens` + /// (response headroom + system-prompt allowance). The session system prompt + /// is not visible here, so its real cost can't be subtracted — the + /// allowance baked into `reservedTokens` covers it. Returns `0` when the + /// reservation meets or exceeds the window; callers must guard that case + /// (`reservedTokens >= contextSize`) and skip compression rather than churn + /// against a zero budget. + func historyBudget(contextSize: Int, reservedTokens: Int) -> Int { + max(0, contextSize - reservedTokens) } /// `true` for records that must survive compression regardless of budget: /// `.system`-role prompt fragments and `.memory` summarisation artifacts /// (a prior compression brief must not be evicted by a later pass). /// - // TODO: per-message pinning is not visible to a `[ChatMessage]`-only - // policy — `ChatSession.pinnedMessageIDs` lives on the session. When the - // seam grows a pinned-IDs channel, honor it here. + // TODO(#1885): honor per-message pins. The data already exists — + // `ChatSession.pinnedMessageIDs` holds the pinned IDs — but the + // `CompressionPolicy.compress(history:sessionID:generate:)` / + // `PreTurnCompressionPolicy.compressBeforeTurn(...)` signatures pass only a + // `[ChatMessage]` and the `sessionID`, not the pinned-ID set. Threading the + // pins through is a PROTOCOL-SIGNATURE change (a new parameter or a session + // handle), so it is deliberately out of scope for this PR; see the note on + // the `CompressionPolicy` protocol doc. func isLoadBearing(_ message: ChatMessage) -> Bool { if message.role == .system { return true } if case .memory = message.kind { return true } diff --git a/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift index a295a8ac..1883771f 100644 --- a/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift +++ b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift @@ -7,27 +7,56 @@ import ManifoldInference /// (pre-turn) — so one value drives either, or both, without a bespoke /// conformance. /// -/// Construct via the strategy factories rather than the initializer: +/// Construct via the strategy factories rather than the initializer, and inject +/// the policy through ``ConversationRuntimeOptions`` (or the matching +/// `ConversationRuntime` init parameter) — there is no mutable +/// `runtime.compressionPolicy` property: /// /// ```swift /// // Zero-inference, smart selection. Compress at 75% context utilisation. -/// runtime.compressionPolicy = .extractive(threshold: 0.75, contextSize: 8_192) -/// -/// // Inference-backed summary of old turns, kept-recent tail. Pre-turn seam. -/// runtime.preTurnCompressionPolicy = .anchored(threshold: 0.85, contextSize: 8_192) +/// let options = ConversationRuntimeOptions( +/// compressionPolicy: .extractive(threshold: 0.75, contextSize: 8_192) +/// ) /// /// // Cheapest baseline: drop oldest, keep system/summary records + newest. -/// runtime.compressionPolicy = .truncating(threshold: 0.90, contextSize: 4_096) +/// let truncating = ConversationRuntimeOptions( +/// compressionPolicy: .truncating(threshold: 0.90, contextSize: 4_096) +/// ) +/// +/// // Inference-backed summary of old turns, kept-recent tail. Prefer the +/// // POST-turn seam: anchored runs a full summariser round-trip, and on the +/// // pre-turn seam that latency lands before the user's message even renders. +/// let anchored = ConversationRuntimeOptions( +/// compressionPolicy: .anchored(threshold: 0.85, contextSize: 8_192) +/// ) +/// +/// // For the PRE-turn seam, prefer a zero-inference strategy so turn setup +/// // pays no summariser latency: +/// let preTurn = ConversationRuntimeOptions( +/// preTurnCompressionPolicy: .extractive(threshold: 0.85, contextSize: 8_192) +/// ) /// ``` /// +/// ## Budget realism (seam constraint) +/// +/// The protocol `compress` signatures pass neither a system prompt nor a +/// tokenizer, so this policy holds both as configuration. `reservedTokens` +/// carves response headroom **plus** a system-prompt allowance out of +/// `contextSize`; the history budget is `contextSize − reservedTokens`. Pass a +/// real `tokenizer:` to the factories for a guaranteed budget — with +/// `tokenizer: nil` the budget is **advisory** (a chars/4 heuristic that can +/// diverge from the backend's real token count that drives the trigger). +/// /// ## Trigger asymmetry /// /// ``CompressionPolicy/shouldCompress(promptTokens:contextSize:contextUtilization:)`` /// receives utilisation directly. ``PreTurnCompressionPolicy`` does not — its /// `shouldCompressBeforeTurn` sees only `messageCount` and `lastPromptTokens`, /// so this policy stores `contextSize` and computes utilisation from -/// `lastPromptTokens / contextSize`. Given equivalent inputs the two seams -/// agree. +/// `lastPromptTokens / contextSize`. The two can disagree at the boundary by a +/// rounding margin: a post-turn caller that hands in an already-rounded +/// utilisation may cross the threshold while the pre-turn recompute from raw +/// `lastPromptTokens` stays just below it. public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPolicy { private let strategy: any CompressionStrategy @@ -36,17 +65,49 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol /// Backend context window (tokens). Held as configuration because the /// pre-turn seam does not pass it and the strategy needs it to size budget. public let contextSize: Int + /// Tokens reserved out of `contextSize` before history is sized: response + /// headroom + a system-prompt allowance (the real system prompt is not + /// visible to the strategy). Single source of truth for the reservation. + public let reservedTokens: Int private let tokenizer: (any TokenizerProvider)? + /// Default reservation when a caller doesn't override it. + /// + /// The legacy value was a bare `512`, which only covered a short response + /// and left **nothing** for the session system prompt (which is not part of + /// `history` and so can't be subtracted directly here). `2048` reserves a + /// generous response headroom plus a system-prompt allowance: it matches + /// the `maxOutputTokens ?? 2048` default that `GenerationQueue` / + /// `PromptAssembler` already reserve at the wire layer, so the trigger and + /// the budget agree on roughly the same headroom. Reasoning models that + /// emit thousands of thinking tokens should raise this further (see + /// ``scaledReservedTokens(forContextSize:base:)``) — a too-small reserve is + /// the classic thinking-model overflow. + public static let defaultReservedTokens = 2_048 + + /// A context-scaled reservation: never below `base`, and at least ~12.5% of + /// the window so large contexts leave proportional response/thinking + /// headroom. Capped at half the window so a tiny `base` can't starve + /// history on small contexts. Use this for reasoning models. + public static func scaledReservedTokens( + forContextSize contextSize: Int, + base: Int = defaultReservedTokens + ) -> Int { + let scaled = max(base, contextSize / 8) + return min(scaled, max(base, contextSize / 2)) + } + init( strategy: any CompressionStrategy, threshold: Double, contextSize: Int, + reservedTokens: Int, tokenizer: (any TokenizerProvider)? ) { self.strategy = strategy self.threshold = threshold self.contextSize = contextSize + self.reservedTokens = reservedTokens self.tokenizer = tokenizer } @@ -54,14 +115,22 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol /// Zero-inference sliding window: keep system/summary records + the newest /// messages that fit, drop the oldest. + /// + /// - Parameters: + /// - reservedTokens: Tokens carved from `contextSize` for response + + /// system-prompt headroom (default ``defaultReservedTokens``). + /// - tokenizer: Inject the backend's tokenizer for a guaranteed budget; + /// `nil` (default) makes the budget advisory (chars/4 heuristic). public static func truncating( threshold: Double = 0.90, contextSize: Int, + reservedTokens: Int = defaultReservedTokens, tokenizer: (any TokenizerProvider)? = nil ) -> DefaultCompressionPolicy { DefaultCompressionPolicy( strategy: TruncatingCompressionStrategy(), - threshold: threshold, contextSize: contextSize, tokenizer: tokenizer + threshold: threshold, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer ) } @@ -72,33 +141,47 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol threshold: Double = 0.75, headBudgetFraction: Double = 0.0, contextSize: Int, + reservedTokens: Int = defaultReservedTokens, tokenizer: (any TokenizerProvider)? = nil ) -> DefaultCompressionPolicy { DefaultCompressionPolicy( strategy: ExtractiveCompressionStrategy(headBudgetFraction: headBudgetFraction), - threshold: threshold, contextSize: contextSize, tokenizer: tokenizer + threshold: threshold, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer ) } /// Inference-backed summary of old turns prepended to a verbatim recent /// tail. Falls back to extractive when no summary can be produced. /// - /// - Parameter summarizerInputWindow: the summariser's REAL window, used to - /// size how much old text it reads — set this to the backend's true - /// context size when `contextSize` is a small overflow trigger. + /// Prefer the **post-turn** seam for anchored: it runs a full summariser + /// round-trip, and on the pre-turn seam that latency is paid before the + /// user's just-typed message renders. + /// + /// - Parameters: + /// - summarizerInputWindow: the summariser's REAL window, used to size how + /// much old text it reads — set this to the backend's true context size + /// when `contextSize` is a small overflow trigger. + /// - summaryTemplate: custom prompt. Note the coupling with + /// `parseSummaryResponse`: a custom template should emit + /// `UPPERCASE-FIELD: value` lines (≥2) or the parser degrades to a + /// raw-truncated brief. `{old_text}` is the substitution placeholder. public static func anchored( threshold: Double = 0.85, contextSize: Int, + reservedTokens: Int = defaultReservedTokens, summarizerInputWindow: Int? = nil, summaryTemplate: String? = nil, tokenizer: (any TokenizerProvider)? = nil ) -> DefaultCompressionPolicy { DefaultCompressionPolicy( strategy: AnchoredCompressionStrategy( + summarizerResponseBuffer: reservedTokens, summarizerInputWindow: summarizerInputWindow, summaryTemplate: summaryTemplate ), - threshold: threshold, contextSize: contextSize, tokenizer: tokenizer + threshold: threshold, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer ) } @@ -113,9 +196,19 @@ public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPol sessionID: UUID, generate: @Sendable ([ChatMessage]) async throws -> String ) async throws -> [ChatMessage] { - try await strategy.compress( + // Guard the degenerate window: if the reservation meets or exceeds the + // context the history budget is zero, and every pass would report + // "over budget" forever (the 512-token simulator cap is the canonical + // trap). Skip rather than churn. + guard contextSize > reservedTokens else { + Log.inference.warning( + "[Compression] contextSize \(contextSize) <= reservedTokens \(reservedTokens); skipping compression (no usable history budget)" + ) + return history + } + return try await strategy.compress( history: history, contextSize: contextSize, - tokenizer: tokenizer, generate: generate + reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate ) } diff --git a/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift index 8332a5c0..327fdc98 100644 --- a/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift +++ b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift @@ -20,8 +20,20 @@ struct ExtractiveCompressionStrategy: CompressionStrategy { let headBudgetFraction: Double let recencyWeight: Double let lengthWeight: Double + /// Weight for capitalized-word density. NOTE: this signal assumes + /// English-like prose where proper nouns and sentence starts are + /// capitalized. It degrades on all-lowercase text, source code, and + /// non-cased scripts (CJK), where density trends to ~0 and the term simply + /// drops out. It is the smallest weight (0.2) precisely so it only *nudges* + /// selection rather than dominating it. let keywordDensityWeight: Double + /// Combined ceiling for the verbatim head + tail fractions. Past this the + /// pinned-verbatim core could equal or exceed the whole budget, leaving no + /// room for scored selection and risking an over-budget result before + /// scoring even runs. + static let maxVerbatimCoreFraction = 0.8 + init( tailBudgetFraction: Double = 0.40, headBudgetFraction: Double = 0.0, @@ -39,12 +51,13 @@ struct ExtractiveCompressionStrategy: CompressionStrategy { func compress( history: [ChatMessage], contextSize: Int, + reservedTokens: Int, tokenizer: (any TokenizerProvider)?, generate: @Sendable ([ChatMessage]) async throws -> String ) async throws -> [ChatMessage] { guard !history.isEmpty else { return [] } - let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer) + let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens) let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) } let originalTokens = tokens.reduce(0, +) @@ -63,8 +76,16 @@ struct ExtractiveCompressionStrategy: CompressionStrategy { used += tokens[i] } + // Clamp the verbatim core (head + tail) so the two reserved bands can't + // jointly claim the whole budget and leave nothing for scored selection + // (or overflow it before scoring runs). Tail keeps priority; head takes + // whatever remains under the ceiling. + let coreFraction = min(Self.maxVerbatimCoreFraction, tailBudgetFraction + headBudgetFraction) + let effectiveTailFraction = min(tailBudgetFraction, coreFraction) + let effectiveHeadFraction = max(0.0, coreFraction - effectiveTailFraction) + // --- Verbatim tail (newest) --- - let tailBudget = Int(Double(budget) * tailBudgetFraction) + let tailBudget = Int(Double(budget) * effectiveTailFraction) var tailUsed = 0 for i in stride(from: count - 1, through: 0, by: -1) { if keep.contains(i) { continue } @@ -82,8 +103,8 @@ struct ExtractiveCompressionStrategy: CompressionStrategy { } // --- Verbatim head (oldest) — anti "lost in the middle" --- - if headBudgetFraction > 0 { - let headBudget = Int(Double(budget) * headBudgetFraction) + if effectiveHeadFraction > 0 { + let headBudget = Int(Double(budget) * effectiveHeadFraction) var headUsed = 0 for i in 0.. budget { + let newest = count - 1 + for i in 0.. String ) async throws -> [ChatMessage] { guard !history.isEmpty else { return [] } - let budget = historyBudget(contextSize: contextSize, tokenizer: tokenizer) + let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens) if estimateTokens(history, tokenizer: tokenizer) <= budget { return history } diff --git a/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift b/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift index f23321ca..216ecac1 100644 --- a/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift +++ b/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift @@ -32,6 +32,68 @@ final class ContextWindowManagerTests: XCTestCase { XCTAssertEqual(ContextWindowManager.estimateTokenCount("a"), 1) } + // MARK: - Per-part / per-message estimation (#1885 finding 9) + + func test_estimateTokenCount_textPart_matchesStringOverload() { + let text = String(repeating: "a", count: 100) // 25 tokens + XCTAssertEqual(ContextWindowManager.estimateTokenCount(MessagePart.text(text)), + ContextWindowManager.estimateTokenCount(text)) + } + + func test_estimateTokenCount_imagePart_isNonZeroFixed() { + let part = MessagePart.image(data: Data(repeating: 0, count: 8), mimeType: "image/png") + XCTAssertEqual(ContextWindowManager.estimateTokenCount(part), + ContextWindowManager.imagePartTokenEstimate) + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0, + "image must not estimate as zero") + } + + func test_estimateTokenCount_audioPart_isNonZeroFixed() { + let part = MessagePart.audio(url: URL(fileURLWithPath: "/tmp/a.m4a"), duration: 3, waveform: nil) + XCTAssertEqual(ContextWindowManager.estimateTokenCount(part), + ContextWindowManager.audioPartTokenEstimate) + } + + func test_estimateTokenCount_toolCallPart_countsNameAndArgs() { + let part = MessagePart.toolCall(ToolCall( + id: "c1", toolName: "get_weather", + arguments: String(repeating: "x", count: 80))) + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0, + "tool call args must be counted") + } + + func test_estimateTokenCount_toolResultPart_countsPayload() { + let part = MessagePart.toolResult(ToolResult( + callId: "c1", content: String(repeating: "y", count: 120))) + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0) + } + + /// The crux: a message with ONLY non-text parts used to estimate as zero + /// via `.content`. The per-message estimate must be non-zero. + func test_estimateTokenCount_imageOnlyMessage_isNonZero() { + let msg = ChatMessage( + role: .user, + contentParts: [.image(data: Data(repeating: 0, count: 8), mimeType: "image/png")], + sessionID: UUID()) + XCTAssertEqual(msg.content, "", "precondition: .content is empty for image-only message") + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(msg), 0, + "image-only message must not estimate as zero") + } + + func test_estimateTokenCount_message_sumsAcrossParts() { + let text = String(repeating: "a", count: 100) // 25 tokens + let msg = ChatMessage( + role: .user, + contentParts: [ + .text(text), + .image(data: Data(repeating: 0, count: 8), mimeType: "image/png"), + ], + sessionID: UUID()) + let expected = ContextWindowManager.estimateTokenCount(text) + + ContextWindowManager.imagePartTokenEstimate + XCTAssertEqual(ContextWindowManager.estimateTokenCount(msg), expected) + } + // MARK: - Context Size Resolution func test_resolveContextSize_sessionOverrideTakesPriority() { diff --git a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift index cc9aaf3f..3ffb32d5 100644 --- a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift +++ b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift @@ -23,12 +23,23 @@ final class DefaultCompressionPolicyTests: XCTestCase { (0.. Int { max(0, contextSize - 512) } + private let reservedTokens = 512 + private func budget() -> Int { max(0, contextSize - reservedTokens) } private func tokens(_ messages: [ChatMessage]) -> Int { - messages.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1.content, tokenizer: nil) } + messages.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1, tokenizer: nil) } + } + + /// A real (non-nil) tokenizer to exercise the tokenizer-injected path. + /// Counts whitespace-separated words — deterministic and != the chars/4 + /// heuristic, so a test that passes it really takes the tokenizer branch. + private struct WordTokenizer: TokenizerProvider { + func tokenCount(_ text: String) -> Int { + max(1, text.split(whereSeparator: { $0.isWhitespace }).count) + } } private static func echoGenerate(_: [ChatMessage]) async throws -> String { @@ -40,7 +51,8 @@ final class DefaultCompressionPolicyTests: XCTestCase { func testTruncatingLeavesSmallHistoryUntouched() async throws { let history = [msg(.user, words: 5), msg(.assistant, words: 5)] let out = try await TruncatingCompressionStrategy().compress( - history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) XCTAssertEqual(out.map(\.id), history.map(\.id)) } @@ -49,7 +61,8 @@ final class DefaultCompressionPolicyTests: XCTestCase { XCTAssertGreaterThan(tokens(history), budget()) // precondition: actually overflows let out = try await TruncatingCompressionStrategy().compress( - history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) XCTAssertLessThan(out.count, history.count, "expected eviction") XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive") @@ -70,7 +83,8 @@ final class DefaultCompressionPolicyTests: XCTestCase { XCTAssertGreaterThan(tokens([history[0]]), budget(), "precondition: load-bearing alone overflows") let out = try await TruncatingCompressionStrategy().compress( - history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive even when load-bearing fills the budget") XCTAssertTrue(out.contains { $0.role == .system }, "load-bearing record retained") @@ -82,31 +96,51 @@ final class DefaultCompressionPolicyTests: XCTestCase { history.append(contentsOf: overflowingHistory()) let out = try await TruncatingCompressionStrategy().compress( - history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) XCTAssertTrue(out.contains { $0.role == .system }, "system prompt must survive") XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false }, "prior summary must survive") } + /// Real-tokenizer path: a deterministic word tokenizer (not chars/4) must + /// still reduce below the budget it computes. + func testTruncatingWithRealTokenizerReducesBelowBudget() async throws { + let tok = WordTokenizer() + let history = overflowingHistory(turns: 20, words: 200) + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: tok, generate: { _ in "" }) + let usedWords = out.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1, tokenizer: tok) } + XCTAssertLessThanOrEqual(usedWords, budget()) + XCTAssertEqual(out.last?.id, history.last?.id) + } + // MARK: - Extractive func testExtractiveReducesBelowBudgetAndKeepsNewest() async throws { let history = overflowingHistory() let out = try await ExtractiveCompressionStrategy().compress( - history: history, contextSize: contextSize, tokenizer: nil, generate: { _ in "" }) + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) XCTAssertLessThanOrEqual(tokens(out), budget()) XCTAssertEqual(out.last?.id, history.last?.id) - XCTAssertEqual(out.map(\.id), out.map(\.id).sorted { a, b in - (history.firstIndex { $0.id == a } ?? 0) < (history.firstIndex { $0.id == b } ?? 0) - }, "output must stay chronological") + // Output must be strictly increasing in original history index — guards + // chronological order against re-ordering (not a self-sort tautology). + let indices = out.map { m in history.firstIndex { $0.id == m.id }! } + XCTAssertEqual(indices, indices.sorted(), "output indices must be sorted") + for i in 1..` scratchpad must not appear in the summary record. + func testAnchoredStripsLeakedThinkingFromSummary() async throws { + let history = overflowingHistory() + let leaky: @Sendable ([ChatMessage]) async throws -> String = { _ in + "I should mention SECRET_LEAK while reasoning\nTOPIC: testing\nKEY POINTS: a; b; c" + } + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: leaky) + let summary = try XCTUnwrap(out.first) + XCTAssertFalse(summary.content.contains("SECRET_LEAK"), "thinking must be stripped") + XCTAssertFalse(summary.content.contains("")) + XCTAssertTrue(summary.content.contains("TOPIC"), "visible fields survive") + } + + /// Chunk-and-fold: oldText exceeds the usable summariser input window, so + /// the strategy chunks. `generate` records every prompt; assert ≥2 calls + /// and that content from the OLDEST chunk is represented in the brief. + func testAnchoredChunkAndFold() async throws { + actor PromptRecorder { + var prompts: [String] = [] + func record(_ p: String) { prompts.append(p) } + } + let recorder = PromptRecorder() + let history = overflowingHistory(turns: 30, words: 120) + // First message carries a unique marker we can trace to the oldest chunk. + var tagged = history + tagged[0] = ChatMessage(role: .user, content: "OLDEST_MARKER " + history[0].content, sessionID: sessionID) + + let generate: @Sendable ([ChatMessage]) async throws -> String = { msgs in + let prompt = msgs.first?.content ?? "" + await recorder.record(prompt) + // Echo back any marker the chunk contained so it reaches the fold. + if prompt.contains("OLDEST_MARKER") { + return "TOPIC: oldest\nKEY POINTS: OLDEST_MARKER seen; b; c" + } + return "TOPIC: chunk\nKEY POINTS: a; b; c" + } + + // summarizerInputWindow small enough that old text exceeds the usable budget. + let out = try await AnchoredCompressionStrategy( + summarizerResponseBuffer: 64, summarizerInputWindow: 600 + ).compress( + history: tagged, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: generate) + + let calls = await recorder.prompts + XCTAssertGreaterThanOrEqual(calls.count, 2, "chunking should produce ≥2 generate calls (chunks + fold)") + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected memory summary") } + XCTAssertTrue(summary.content.contains("OLDEST_MARKER") || calls.contains { $0.contains("OLDEST_MARKER") }, + "oldest chunk content must be represented") + } + + /// Chunk-failure: the FIRST chunk's `generate` throws → that chunk's raw + /// content is preserved via the truncated-text fallback (not lost), and the + /// overall compression still produces a `.memory` summary because the fold + /// (a later call) succeeds. + func testAnchoredChunkFailurePreservesContent() async throws { + actor CallCounter { var n = 0; func next() -> Int { n += 1; return n } } + let counter = CallCounter() + let history = overflowingHistory(turns: 30, words: 120) + var tagged = history + tagged[0] = ChatMessage(role: .user, content: "OLDEST_MARKER " + history[0].content, sessionID: sessionID) + + // Throw on the first generate call (a chunk), succeed on all later calls + // (remaining chunks + the fold). The first chunk's raw text falls back + // via truncateToFit so its content is not dropped. + let generate: @Sendable ([ChatMessage]) async throws -> String = { _ in + struct ChunkBoom: Error {} + if await counter.next() == 1 { throw ChunkBoom() } + return "TOPIC: chunk\nKEY POINTS: a; b; c" + } + let out = try await AnchoredCompressionStrategy( + summarizerResponseBuffer: 64, summarizerInputWindow: 600 + ).compress( + history: tagged, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: generate) + // The summary record exists (top-level summarise succeeded on the fold). + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected memory summary despite chunk failure") } + let totalCalls = await counter.n + XCTAssertGreaterThanOrEqual(totalCalls, 2, "should retry remaining chunks + fold after one chunk failed") + XCTAssertFalse(out.isEmpty) + } + + /// Summary-floor: the tail consumes ~the whole budget, leaving no room for + /// the summary. The strategy must still emit a non-empty `.memory` record + /// AND keep the result within budget. + func testAnchoredSummaryFloor() async throws { + // tailBudgetFraction 0.95 → tail eats almost all budget. + let history = overflowingHistory(turns: 20, words: 120) + let out = try await AnchoredCompressionStrategy(tailBudgetFraction: 0.95).compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: Self.echoGenerate) + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected floored memory summary") } + XCTAssertFalse(summary.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty, + "summary floor must produce a non-empty brief") + XCTAssertLessThanOrEqual(tokens(out), budget()) + } + + /// Cancellation mid-summarise: cancelling the surrounding Task returns the + /// tail with NO summary record. + func testAnchoredCancellationMidSummarize() async throws { + let history = overflowingHistory() + // Hoist instance properties to locals so the Task closure doesn't + // capture (non-Sendable) `self`. + let ctx = contextSize + let reserve = reservedTokens + let task = Task { () -> [ChatMessage] in + try await AnchoredCompressionStrategy().compress( + history: history, contextSize: ctx, reservedTokens: reserve, + tokenizer: nil, + generate: { _ in + // Yield so cancellation lands before/within summarise. + try await Task.sleep(nanoseconds: 50_000_000) + return "TOPIC: x\nKEY POINTS: a; b" + }) + } + task.cancel() + let out = try await task.value + XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false }, + "cancellation must not inject a summary record") + XCTAssertEqual(out.last?.id, history.last?.id, "tail preserved on cancel") + } + + /// `parseSummaryResponse` <2-field raw-fallback branch: a summary with only + /// one recognisable field degrades to the trimmed raw response, not an + /// empty/placeholder brief. + func testAnchoredSingleFieldRawFallback() async throws { + let history = overflowingHistory() + let oneField: @Sendable ([ChatMessage]) async throws -> String = { _ in + "TOPIC: only one field here and some prose that should survive verbatim" + } + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: oneField) + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected memory summary") } + XCTAssertTrue(summary.content.contains("prose that should survive"), + "single-field response degrades to trimmed raw text") + } + // MARK: - Policy thresholds & seam agreement func testShouldCompressHonorsThreshold() { @@ -179,26 +418,42 @@ final class DefaultCompressionPolicyTests: XCTestCase { "unknown context size never compresses") } - func testPreTurnAndPostTurnTriggersAgree() { + /// Trigger boundary, hand-computed. Both seams fire at exactly the + /// threshold and decline just below it; and the rounding asymmetry where a + /// post-turn caller passing a rounded utilisation fires while the pre-turn + /// recompute from raw tokens stays below. + func testTriggerAsymmetryBoundary() { let threshold = 0.80 let policy = DefaultCompressionPolicy.truncating(threshold: threshold, contextSize: contextSize) - // Equivalent inputs: promptTokens = 0.85 * contextSize → both should fire. - let promptTokens = Int(0.85 * Double(contextSize)) - let utilization = Double(promptTokens) / Double(contextSize) - - let postTurn = policy.shouldCompress( - promptTokens: promptTokens, contextSize: contextSize, contextUtilization: utilization) - let preTurn = policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: promptTokens) - XCTAssertEqual(preTurn, postTurn) - XCTAssertTrue(preTurn) - - // Below threshold: both decline. - let lowTokens = Int(0.50 * Double(contextSize)) - XCTAssertEqual( - policy.shouldCompressBeforeTurn(messageCount: 99, lastPromptTokens: lowTokens), - policy.shouldCompress(promptTokens: lowTokens, contextSize: contextSize, - contextUtilization: Double(lowTokens) / Double(contextSize))) + // promptTokens chosen so utilisation == threshold exactly. + let atTokens = Int(threshold * Double(contextSize)) // 0.80 * 2048 = 1638 (1638/2048 = 0.7998…) + // At the exact integer-token boundary the recomputed utilisation may be + // a hair below threshold; assert both seams agree with the hand-computed + // recompute regardless. + let atUtil = Double(atTokens) / Double(contextSize) + let preAt = policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: atTokens) + let postAt = policy.shouldCompress(promptTokens: atTokens, contextSize: contextSize, contextUtilization: atUtil) + XCTAssertEqual(preAt, postAt) + XCTAssertEqual(preAt, atUtil >= threshold, "pre-turn matches hand-computed bool at boundary") + + // threshold − epsilon: definitively below → both decline. + let belowTokens = Int((threshold - 0.01) * Double(contextSize)) + let belowUtil = Double(belowTokens) / Double(contextSize) + XCTAssertFalse(policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: belowTokens)) + XCTAssertFalse(policy.shouldCompress(promptTokens: belowTokens, contextSize: contextSize, contextUtilization: belowUtil)) + + // Rounding asymmetry: a post-turn caller that ROUNDS utilisation up to + // the threshold fires, while the pre-turn recompute from raw tokens + // (which is just under) does not. Demonstrates the documented seam gap. + let justUnderTokens = Int(threshold * Double(contextSize)) - 1 // 1637/2048 = 0.79931 < 0.80 + let recomputed = Double(justUnderTokens) / Double(contextSize) + XCTAssertLessThan(recomputed, threshold, "raw recompute is below threshold") + let preJustUnder = policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: justUnderTokens) + let postRounded = policy.shouldCompress(promptTokens: justUnderTokens, contextSize: contextSize, + contextUtilization: threshold) // caller passes rounded value + XCTAssertFalse(preJustUnder, "pre-turn recompute declines just below threshold") + XCTAssertTrue(postRounded, "post-turn fires when caller passes a utilisation already at threshold") } func testPreTurnWithoutPriorTokensDoesNotCompress() { @@ -207,8 +462,11 @@ final class DefaultCompressionPolicyTests: XCTestCase { } func testPolicyCompressDelegatesToStrategy() async throws { - let history = overflowingHistory() - let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: contextSize) + // Context comfortably above the default 2048 reserve so the policy's + // small-window guard doesn't skip; history overflows the resulting budget. + let largeContext = 8_192 + let history = overflowingHistory(turns: 80, words: 120) + let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: largeContext) let out = try await policy.compress(history: history, sessionID: sessionID, generate: Self.echoGenerate) XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false }) @@ -216,4 +474,29 @@ final class DefaultCompressionPolicyTests: XCTestCase { let preOut = try await policy.compressBeforeTurn(history: history, sessionID: sessionID, generate: Self.echoGenerate) XCTAssertEqual(preOut.first?.kind.rawStorage, out.first?.kind.rawStorage) } + + /// Finding 1 guard: a context window at or below the reservation has no + /// usable history budget; the policy must skip compression (return history + /// unchanged) rather than churn against a zero/negative budget. + func testPolicySkipsWhenContextSmallerThanReserve() async throws { + // 512-token simulator cap with the default 2048 reserve. + let policy = DefaultCompressionPolicy.truncating(contextSize: 512) + let history = overflowingHistory() + let out = try await policy.compress(history: history, sessionID: sessionID, generate: { _ in "" }) + XCTAssertEqual(out.map(\.id), history.map(\.id), "no usable budget → history unchanged") + } + + /// The default reserve is clearly larger than the legacy 512. + func testDefaultReserveIsLargerThanLegacy() { + XCTAssertGreaterThan(DefaultCompressionPolicy.defaultReservedTokens, 512) + } + + /// Context-scaled reserve grows with the window but stays under half of it. + func testScaledReserveBounds() { + let small = DefaultCompressionPolicy.scaledReservedTokens(forContextSize: 4_096) + XCTAssertGreaterThanOrEqual(small, DefaultCompressionPolicy.defaultReservedTokens) + let large = DefaultCompressionPolicy.scaledReservedTokens(forContextSize: 131_072) + XCTAssertEqual(large, 131_072 / 8, "scales to ~12.5% of a big window") + XCTAssertLessThanOrEqual(large, 131_072 / 2, "never exceeds half the window") + } }