diff --git a/Sources/ManifoldInference/Services/ContextWindowManager.swift b/Sources/ManifoldInference/Services/ContextWindowManager.swift index c70aa2982..c04a09e66 100644 --- a/Sources/ManifoldInference/Services/ContextWindowManager.swift +++ b/Sources/ManifoldInference/Services/ContextWindowManager.swift @@ -71,6 +71,60 @@ public enum ContextWindowManager { return HeuristicTokenizer().tokenCount(text) } + /// Fixed per-part token cost for a non-text modality whose true token + /// footprint isn't visible from the `[ChatMessage]` value alone. + /// + /// `.content` only exposes `.text` parts, so an image/audio part would + /// otherwise estimate as **zero** — a multimodal turn could then sail past + /// the compression trigger and overflow at generation. These are coarse + /// placeholders, not provider-accurate counts (a single image is ~85–1.5k + /// tokens depending on resolution / provider), deliberately sized to be + /// "clearly non-zero, never catastrophically wrong": they make the budget + /// math notice the part exists. A backend that knows its real tiling cost + /// should account for that at the wire layer, not here. + static let imagePartTokenEstimate = 768 + static let audioPartTokenEstimate = 384 + static let generatedMediaPartTokenEstimate = 256 + + /// Estimates the token cost of a single `MessagePart`. + /// + /// - `.text` / `.thinking`: counted via the tokenizer (heuristic when nil). + /// - `.image` / `.audio` / `.generatedMedia`: a documented fixed estimate + /// (``imagePartTokenEstimate`` etc.) since the byte payload's true token + /// footprint isn't derivable here. + /// - `.toolCall`: the tool name plus its serialized JSON arguments. + /// - `.toolResult`: the serialized result payload (content + dialog). + public static func estimateTokenCount(_ part: MessagePart, tokenizer: TokenizerProvider? = nil) -> Int { + switch part { + case .text(let t): + return estimateTokenCount(t, tokenizer: tokenizer) + case .thinking(let t, _): + return estimateTokenCount(t, tokenizer: tokenizer) + case .image: + return imagePartTokenEstimate + case .audio: + return audioPartTokenEstimate + case .generatedMedia: + return generatedMediaPartTokenEstimate + case .toolCall(let call): + return estimateTokenCount("\(call.toolName) \(call.arguments)", tokenizer: tokenizer) + case .toolResult(let result): + let payload = [result.content, result.dialog].compactMap { $0 }.joined(separator: " ") + return estimateTokenCount(payload, tokenizer: tokenizer) + } + } + + /// Estimates the token cost of a whole message by summing every content + /// part — text, reasoning, multimodal, and tool parts alike. + /// + /// Prefer this over `estimateTokenCount(message.content, …)`: `.content` + /// discards everything but `.text` parts, so the string overload silently + /// under-counts (to zero) image/audio/tool-only messages and lets them + /// overflow the window. Sums across `contentParts` instead. + public static func estimateTokenCount(_ message: ChatMessage, tokenizer: TokenizerProvider? = nil) -> Int { + message.contentParts.reduce(0) { $0 + estimateTokenCount($1, tokenizer: tokenizer) } + } + /// Resolves the effective context size from available sources. /// /// Priority: session override > model metadata > backend capabilities > default. @@ -121,7 +175,8 @@ public enum ContextWindowManager { var usedTokens = 0 for i in stride(from: messages.count - 1, through: 0, by: -1) { - let messageTokens = estimateTokenCount(messages[i].content, tokenizer: tokenizer) + // Sum across all parts — `.content` would miss image/audio/tool parts. + let messageTokens = estimateTokenCount(messages[i], tokenizer: tokenizer) if usedTokens + messageTokens > available && firstKeptIndex < messages.endIndex { break } @@ -141,7 +196,7 @@ public enum ContextWindowManager { tokenizer: TokenizerProvider? = nil ) -> ContextBudget { let systemTokens = estimateTokenCount(systemPrompt ?? "", tokenizer: tokenizer) - let messageTokens = messages.reduce(0) { $0 + estimateTokenCount($1.content, tokenizer: tokenizer) } + let messageTokens = messages.reduce(0) { $0 + estimateTokenCount($1, tokenizer: tokenizer) } let availableForHistory = maxTokens - systemTokens - responseBuffer return ContextBudget( diff --git a/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift b/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift index cdd303c6e..314a92122 100644 --- a/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift +++ b/Sources/ManifoldRuntime/Protocols/CompressionPolicy.swift @@ -11,6 +11,17 @@ import ManifoldInference /// Compression failures are logged and do not abort the turn — the existing /// history is preserved. /// +/// ## Per-message pins are not yet threaded through this seam +/// +/// `compress(history:sessionID:generate:)` passes only a `[ChatMessage]` and +/// the `sessionID` — **not** the set of user-pinned message IDs. The data +/// already exists (`ChatSession.pinnedMessageIDsRaw` / `pinnedMessageIDs` on +/// the session record), but honoring pins inside a policy requires a +/// protocol-signature change to carry the pinned-ID set (a new parameter or a +/// session handle). Until that lands, ``DefaultCompressionPolicy`` treats only +/// `.system`-role and `.memory`-kind records as load-bearing; explicit +/// per-message pins are not preserved across compression. +/// /// ## v0.26.0 Migration /// /// The `shouldCompress` signature gained a `contextUtilization` parameter in v0.26.0. diff --git a/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift new file mode 100644 index 000000000..470232ec4 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/AnchoredCompressionStrategy.swift @@ -0,0 +1,316 @@ +import Foundation +import ManifoldInference + +/// Summarises old messages via an inference call, then prepends the summary +/// (as a `.memory("summary")` record) to a verbatim recency tail. Falls back +/// to ``ExtractiveCompressionStrategy`` on any failure — failed or empty +/// inference (including a no-op summariser), or an oversized summary. +/// +/// Adapted from Fireside's `AnchoredStoryCompressor`, preserving its +/// hard-won behaviors: input-window decoupling (`summarizerInputWindow`), +/// chunk-and-fold for over-window input, a minimum-summary floor, and a +/// cancellation early-return. The summary prompt is passed to `generate` as a +/// single-message mini-conversation. +/// +// TODO(#1885, optional P2): collapse/fold a prior `.memory("summary")` record +// into the new summarisation pass instead of pinning it (as load-bearing) into +// the tail and prepending a SECOND inline summary each cycle. Across many +// compression cycles the inline summary blocks stack. Doing this right means +// detecting the prior summary, excluding it from the verbatim tail, and +// feeding its text into the summariser input alongside the old messages — a +// non-trivial change to the tail/old-message partition. Deliberately deferred +// to keep this fix-round diff bounded; the stack is bounded in practice by the +// budget enforcement and is correctness-neutral (just less compact). +struct AnchoredCompressionStrategy: CompressionStrategy { + let name = "anchored" + + let tailBudgetFraction: Double + /// The summariser model's REAL usable window (tokens), used to size how + /// much old text it may READ. Decoupled from `contextSize`, which is the + /// small overflow *trigger* and the *output-brief* budget. `nil` sizes + /// input against `contextSize` (legacy behavior). + let summarizerInputWindow: Int? + /// Minimum tokens reserved for the output brief even when `contextSize` is + /// tiny — a short brief beats no brief. + let minSummaryBudget: Int + /// Tokens reserved for the summariser's own response when sizing the INPUT + /// window. This is the SAME reservation knob as the policy's + /// `reservedTokens` (the factory threads `reservedTokens` here), so there is + /// one source of truth — there is no longer a separate hard-coded buffer. + let summarizerResponseBuffer: Int + let summaryTemplate: String + + private let fallback = ExtractiveCompressionStrategy() + + /// Placeholder `{old_text}` is replaced with the concatenated old messages. + static let defaultSummaryTemplate = """ + Summarize the conversation so far. Be concise. Use only what is in the text. + + TOPIC: [main subject of the conversation, brief] + KEY POINTS: [up to 3 important points, semicolon-separated] + OPEN QUESTIONS: [unresolved items or pending decisions, if any] + LAST DISCUSSED: [most recent topic or conclusion, one sentence] + + Conversation: + {old_text} + """ + + init( + tailBudgetFraction: Double = 0.50, + summarizerResponseBuffer: Int = DefaultCompressionPolicy.defaultReservedTokens, + summarizerInputWindow: Int? = nil, + minSummaryBudget: Int = 256, + summaryTemplate: String? = nil + ) { + self.tailBudgetFraction = tailBudgetFraction + self.summarizerInputWindow = summarizerInputWindow + self.minSummaryBudget = minSummaryBudget + self.summarizerResponseBuffer = summarizerResponseBuffer + self.summaryTemplate = summaryTemplate ?? Self.defaultSummaryTemplate + } + + func compress( + history: [ChatMessage], + contextSize: Int, + reservedTokens: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + guard !history.isEmpty else { return [] } + + let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens) + let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) } + let originalTokens = tokens.reduce(0, +) + if originalTokens <= budget { + return history + } + + let sessionID = history.first?.sessionID ?? UUID() + let count = history.count + + // --- Verbatim tail: load-bearing records + newest within tailBudget --- + let tailBudget = Int(Double(budget) * tailBudgetFraction) + var tailIndices = Set() + var tailTokens = 0 + for i in 0.. usableInputBudget && usableInputBudget > 0 { + Log.inference.warning( + "[AnchoredCompression] input \(oldTextTokens) tok exceeds usable window \(usableInputBudget); chunk-and-folding \(oldMessages.count) old messages" + ) + prompt = await foldedSummaryPrompt( + oldMessages: oldMessages, chunkBudget: usableInputBudget, + sessionID: sessionID, generate: generate, tokenizer: tokenizer + ) + } else { + prompt = summaryTemplate.replacingOccurrences(of: "{old_text}", with: oldText) + } + + // --- Summarise (cancellation returns a minimal tail-only result) --- + let summaryText: String + do { + try Task.checkCancellation() + summaryText = try await generate([summaryMessage(prompt, sessionID: sessionID)]) + } catch is CancellationError { + return tailMessages + } catch { + Log.inference.debug("[AnchoredCompression] summarisation failed: \(error); falling back to extractive") + return try await fallback.compress( + history: history, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate + ) + } + + guard !summaryText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + return try await fallback.compress( + history: history, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate + ) + } + + // --- Assemble: summary record + verbatim tail --- + let parsed = parseSummaryResponse(summaryText) + let summaryTokens = ContextWindowManager.estimateTokenCount(parsed, tokenizer: tokenizer) + + var finalSummary = parsed + if summaryTokens + tailTokens > budget { + let rawSummaryBudget = budget - tailTokens + let summaryBudget = rawSummaryBudget > 0 ? rawSummaryBudget : min(minSummaryBudget, budget) + if rawSummaryBudget <= 0 { + Log.inference.warning( + "[AnchoredCompression] tail consumes the whole budget (tail=\(tailTokens) >= budget=\(budget)); emitting floored \(summaryBudget)-tok brief" + ) + } + let truncated = truncateToFit(parsed, budget: summaryBudget, tokenizer: tokenizer) + finalSummary = truncated.isEmpty ? String(parsed.prefix(200)) : truncated + } + + var output: [ChatMessage] = [summaryRecord(finalSummary, sessionID: sessionID)] + output.append(contentsOf: tailMessages) + return output + } + + // MARK: - Helpers + + private func summaryMessage(_ prompt: String, sessionID: UUID) -> ChatMessage { + ChatMessage(role: .user, content: prompt, sessionID: sessionID) + } + + private func summaryRecord(_ text: String, sessionID: UUID) -> ChatMessage { + ChatMessage(role: .system, content: text, sessionID: sessionID, kind: .memory("summary")) + } + + /// Strips leaked chain-of-thought (``, `…`) + /// before parsing, reusing MK's ``ThinkingTransform`` rather than + /// hand-rolling marker logic. A reasoning model can emit its scratchpad + /// ahead of the brief; left in, that scratchpad would be parsed as the + /// "summary" and injected verbatim into the compressed history. We run the + /// two common marker families (Qwen/DeepSeek ``, Mistral/Sky-T1 + /// ``) in sequence and keep only the visible `.token` text. + private func stripThinking(_ text: String) -> String { + var result = text + for markers in [ThinkingMarkers.qwen3, ThinkingMarkers.mistralReasoning] { + var transform = ThinkingTransform(markers: markers) + var events = transform.process([.token(result)]) + events += transform.finalize() + let visible = events.compactMap { event -> String? in + if case .token(let t) = event { return t } + return nil + }.joined() + result = visible + } + return result + } + + /// Extracts `FIELD: value` lines and reassembles them; falls back to a + /// trimmed raw response when fewer than two fields are present. Strips + /// leaked reasoning first so chain-of-thought can't masquerade as a summary. + private func parseSummaryResponse(_ rawResponse: String) -> String { + let response = stripThinking(rawResponse) + let pattern = "^([A-Z][A-Z _]*[A-Z]):\\s*(.+)$" + let regex: NSRegularExpression + do { + regex = try NSRegularExpression(pattern: pattern, options: [.anchorsMatchLines, .caseInsensitive]) + } catch { + // The pattern is a compile-time constant; a failure here means a + // toolchain regression, not bad input. Surface it and fall back to + // the trimmed raw response rather than swallowing it silently. + Log.inference.error("[AnchoredCompression] summary regex failed to compile: \(error)") + let trimmed = response.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? "[Summary unavailable]" : String(trimmed.prefix(400)) + } + let ns = response as NSString + var fields: [String] = [] + for match in regex.matches(in: response, range: NSRange(location: 0, length: ns.length)) where match.numberOfRanges >= 3 { + let name = ns.substring(with: match.range(at: 1)) + let value = ns.substring(with: match.range(at: 2)).trimmingCharacters(in: .whitespaces) + if !value.isEmpty { fields.append("\(name): \(value)") } + } + if fields.count >= 2 { return fields.joined(separator: "\n") } + let trimmed = response.trimmingCharacters(in: .whitespacesAndNewlines) + return trimmed.isEmpty ? "[Summary unavailable]" : String(trimmed.prefix(400)) + } + + /// Summarises over-window input in chunks, then folds the chunk summaries + /// into one prompt so the WHOLE old text is covered rather than truncated. + private func foldedSummaryPrompt( + oldMessages: [ChatMessage], + chunkBudget: Int, + sessionID: UUID, + generate: @Sendable ([ChatMessage]) async throws -> String, + tokenizer: (any TokenizerProvider)? + ) async -> String { + let templateOverhead = ContextWindowManager.estimateTokenCount( + summaryTemplate.replacingOccurrences(of: "{old_text}", with: ""), + tokenizer: tokenizer + ) + let perChunkBudget = max(64, chunkBudget - templateOverhead) + + var chunks: [[ChatMessage]] = [] + var current: [ChatMessage] = [] + var currentTokens = 0 + for message in oldMessages { + let cost = estimateTokens(message, tokenizer: tokenizer) + if !current.isEmpty && currentTokens + cost > perChunkBudget { + chunks.append(current) + current = [] + currentTokens = 0 + } + current.append(message) + currentTokens += cost + } + if !current.isEmpty { chunks.append(current) } + + var summaries: [String] = [] + for chunk in chunks { + let chunkText = chunk.map(\.content).joined(separator: "\n\n") + let chunkPrompt = summaryTemplate.replacingOccurrences(of: "{old_text}", with: chunkText) + do { + let summary = try await generate([summaryMessage(chunkPrompt, sessionID: sessionID)]) + if summary.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + summaries.append(truncateToFit(chunkText, budget: perChunkBudget, tokenizer: tokenizer)) + } else { + summaries.append(parseSummaryResponse(summary)) + } + } catch { + // Preserve content even when a chunk's summarisation fails. + Log.inference.debug("[AnchoredCompression] chunk summarisation failed: \(error); keeping raw chunk") + summaries.append(truncateToFit(chunkText, budget: perChunkBudget, tokenizer: tokenizer)) + } + } + return summaryTemplate.replacingOccurrences(of: "{old_text}", with: summaries.joined(separator: "\n\n")) + } + + /// Drops trailing words until the text fits the budget. + private func truncateToFit(_ text: String, budget: Int, tokenizer: (any TokenizerProvider)?) -> String { + if ContextWindowManager.estimateTokenCount(text, tokenizer: tokenizer) <= budget { + return text + } + var words = text.split(separator: " ", omittingEmptySubsequences: true) + while !words.isEmpty { + words.removeLast() + let candidate = words.joined(separator: " ") + if ContextWindowManager.estimateTokenCount(candidate, tokenizer: tokenizer) <= budget { + return candidate + } + } + return "" + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift new file mode 100644 index 000000000..ab99e8a34 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/CompressionStrategy.swift @@ -0,0 +1,105 @@ +import Foundation +import ManifoldInference + +/// A pure, stateless algorithm that reduces a chat history to fit a token +/// budget. Strategies are the reusable core shared by both compression seams +/// (``CompressionPolicy`` post-turn and ``PreTurnCompressionPolicy`` +/// pre-turn); ``DefaultCompressionPolicy`` wraps a strategy with the trigger +/// thresholds each protocol needs. +/// +/// Unlike the ``CompressionPolicy``/``PreTurnCompressionPolicy`` `compress` +/// methods — which receive only `history` + `sessionID` — a strategy also +/// takes the `contextSize` and `tokenizer` it sizes against. The policy holds +/// those as configuration and forwards them, because the protocol `compress` +/// signatures intentionally do not. +/// +/// ## System-prompt budgeting (seam constraint) +/// +/// A strategy sees only the `history` array. The session system prompt lives +/// on `ChatSession.systemPrompt` and is *not* part of `history`, and the +/// `CompressionPolicy` / `PreTurnCompressionPolicy` protocol `compress` +/// signatures pass **neither** a system prompt nor a tokenizer — so a strategy +/// cannot subtract a *real* system-prompt token count and cannot read a live +/// tokenizer. Both are therefore **configuration** on ``DefaultCompressionPolicy`` +/// (set via the factories) and forwarded in: `reservedTokens` (response +/// headroom + a system-prompt allowance) and an optional `tokenizer`. The +/// budget is `contextSize − reservedTokens`. Any `.system`-role or `.memory`-kind +/// records that *are* in `history` are treated as load-bearing and preserved +/// verbatim by every strategy. +/// +/// Strategies are `Sendable` value types: `generate` is a parameter, never +/// stored, so there is no mutable summariser handle to guard. +protocol CompressionStrategy: Sendable { + /// Stable identifier recorded for diagnostics (e.g. `"truncating"`). + var name: String { get } + + /// Reduce `history` to fit `contextSize`. + /// + /// - Parameters: + /// - history: Full message history, oldest-first. + /// - contextSize: Backend context window in tokens. + /// - reservedTokens: Tokens carved out of `contextSize` before history is + /// sized — response headroom **plus** a system-prompt allowance (the + /// real prompt isn't visible here). Single source of truth for the + /// reservation across all strategies. + /// - tokenizer: Optional tokenizer for cost estimation; heuristic + /// (chars/4) when `nil`, in which case the budget is **advisory**, not + /// guaranteed to match the backend's real token count. + /// - generate: Inference call for strategies that summarise. Zero-inference + /// strategies ignore it. A closure that yields empty text signals "no + /// usable summariser" — summarising strategies fall back accordingly. + /// - Returns: The replacement history, oldest-first. Never empty when + /// `history` is non-empty. + func compress( + history: [ChatMessage], + contextSize: Int, + reservedTokens: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] +} + +// MARK: - Shared helpers + +extension CompressionStrategy { + /// Per-message token estimate — sums across ALL content parts (text, + /// reasoning, image/audio, tool-call/-result), not just `.text`, so a + /// multimodal or tool-only message isn't counted as zero (#1885 finding 9). + func estimateTokens(_ message: ChatMessage, tokenizer: (any TokenizerProvider)?) -> Int { + ContextWindowManager.estimateTokenCount(message, tokenizer: tokenizer) + } + + /// Total tokens across a message set. + func estimateTokens(_ messages: [ChatMessage], tokenizer: (any TokenizerProvider)?) -> Int { + messages.reduce(0) { $0 + estimateTokens($1, tokenizer: tokenizer) } + } + + /// Token budget available for history: context window minus `reservedTokens` + /// (response headroom + system-prompt allowance). The session system prompt + /// is not visible here, so its real cost can't be subtracted — the + /// allowance baked into `reservedTokens` covers it. Returns `0` when the + /// reservation meets or exceeds the window; callers must guard that case + /// (`reservedTokens >= contextSize`) and skip compression rather than churn + /// against a zero budget. + func historyBudget(contextSize: Int, reservedTokens: Int) -> Int { + max(0, contextSize - reservedTokens) + } + + /// `true` for records that must survive compression regardless of budget: + /// `.system`-role prompt fragments and `.memory` summarisation artifacts + /// (a prior compression brief must not be evicted by a later pass). + /// + // TODO(#1885): honor per-message pins. The data already exists — + // `ChatSession.pinnedMessageIDs` holds the pinned IDs — but the + // `CompressionPolicy.compress(history:sessionID:generate:)` / + // `PreTurnCompressionPolicy.compressBeforeTurn(...)` signatures pass only a + // `[ChatMessage]` and the `sessionID`, not the pinned-ID set. Threading the + // pins through is a PROTOCOL-SIGNATURE change (a new parameter or a session + // handle), so it is deliberately out of scope for this PR; see the note on + // the `CompressionPolicy` protocol doc. + func isLoadBearing(_ message: ChatMessage) -> Bool { + if message.role == .system { return true } + if case .memory = message.kind { return true } + return false + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift new file mode 100644 index 000000000..1883771f7 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/DefaultCompressionPolicy.swift @@ -0,0 +1,229 @@ +import Foundation +import ManifoldInference + +/// Batteries-included compression policy. Pairs a ``CompressionStrategy`` with +/// a utilisation threshold and conforms to **both** compression seams — +/// ``CompressionPolicy`` (post-turn) and ``PreTurnCompressionPolicy`` +/// (pre-turn) — so one value drives either, or both, without a bespoke +/// conformance. +/// +/// Construct via the strategy factories rather than the initializer, and inject +/// the policy through ``ConversationRuntimeOptions`` (or the matching +/// `ConversationRuntime` init parameter) — there is no mutable +/// `runtime.compressionPolicy` property: +/// +/// ```swift +/// // Zero-inference, smart selection. Compress at 75% context utilisation. +/// let options = ConversationRuntimeOptions( +/// compressionPolicy: .extractive(threshold: 0.75, contextSize: 8_192) +/// ) +/// +/// // Cheapest baseline: drop oldest, keep system/summary records + newest. +/// let truncating = ConversationRuntimeOptions( +/// compressionPolicy: .truncating(threshold: 0.90, contextSize: 4_096) +/// ) +/// +/// // Inference-backed summary of old turns, kept-recent tail. Prefer the +/// // POST-turn seam: anchored runs a full summariser round-trip, and on the +/// // pre-turn seam that latency lands before the user's message even renders. +/// let anchored = ConversationRuntimeOptions( +/// compressionPolicy: .anchored(threshold: 0.85, contextSize: 8_192) +/// ) +/// +/// // For the PRE-turn seam, prefer a zero-inference strategy so turn setup +/// // pays no summariser latency: +/// let preTurn = ConversationRuntimeOptions( +/// preTurnCompressionPolicy: .extractive(threshold: 0.85, contextSize: 8_192) +/// ) +/// ``` +/// +/// ## Budget realism (seam constraint) +/// +/// The protocol `compress` signatures pass neither a system prompt nor a +/// tokenizer, so this policy holds both as configuration. `reservedTokens` +/// carves response headroom **plus** a system-prompt allowance out of +/// `contextSize`; the history budget is `contextSize − reservedTokens`. Pass a +/// real `tokenizer:` to the factories for a guaranteed budget — with +/// `tokenizer: nil` the budget is **advisory** (a chars/4 heuristic that can +/// diverge from the backend's real token count that drives the trigger). +/// +/// ## Trigger asymmetry +/// +/// ``CompressionPolicy/shouldCompress(promptTokens:contextSize:contextUtilization:)`` +/// receives utilisation directly. ``PreTurnCompressionPolicy`` does not — its +/// `shouldCompressBeforeTurn` sees only `messageCount` and `lastPromptTokens`, +/// so this policy stores `contextSize` and computes utilisation from +/// `lastPromptTokens / contextSize`. The two can disagree at the boundary by a +/// rounding margin: a post-turn caller that hands in an already-rounded +/// utilisation may cross the threshold while the pre-turn recompute from raw +/// `lastPromptTokens` stays just below it. +public struct DefaultCompressionPolicy: CompressionPolicy, PreTurnCompressionPolicy { + private let strategy: any CompressionStrategy + + /// Context-utilisation ratio at or above which compression triggers. + public let threshold: Double + /// Backend context window (tokens). Held as configuration because the + /// pre-turn seam does not pass it and the strategy needs it to size budget. + public let contextSize: Int + /// Tokens reserved out of `contextSize` before history is sized: response + /// headroom + a system-prompt allowance (the real system prompt is not + /// visible to the strategy). Single source of truth for the reservation. + public let reservedTokens: Int + private let tokenizer: (any TokenizerProvider)? + + /// Default reservation when a caller doesn't override it. + /// + /// The legacy value was a bare `512`, which only covered a short response + /// and left **nothing** for the session system prompt (which is not part of + /// `history` and so can't be subtracted directly here). `2048` reserves a + /// generous response headroom plus a system-prompt allowance: it matches + /// the `maxOutputTokens ?? 2048` default that `GenerationQueue` / + /// `PromptAssembler` already reserve at the wire layer, so the trigger and + /// the budget agree on roughly the same headroom. Reasoning models that + /// emit thousands of thinking tokens should raise this further (see + /// ``scaledReservedTokens(forContextSize:base:)``) — a too-small reserve is + /// the classic thinking-model overflow. + public static let defaultReservedTokens = 2_048 + + /// A context-scaled reservation: never below `base`, and at least ~12.5% of + /// the window so large contexts leave proportional response/thinking + /// headroom. Capped at half the window so a tiny `base` can't starve + /// history on small contexts. Use this for reasoning models. + public static func scaledReservedTokens( + forContextSize contextSize: Int, + base: Int = defaultReservedTokens + ) -> Int { + let scaled = max(base, contextSize / 8) + return min(scaled, max(base, contextSize / 2)) + } + + init( + strategy: any CompressionStrategy, + threshold: Double, + contextSize: Int, + reservedTokens: Int, + tokenizer: (any TokenizerProvider)? + ) { + self.strategy = strategy + self.threshold = threshold + self.contextSize = contextSize + self.reservedTokens = reservedTokens + self.tokenizer = tokenizer + } + + // MARK: - Factories + + /// Zero-inference sliding window: keep system/summary records + the newest + /// messages that fit, drop the oldest. + /// + /// - Parameters: + /// - reservedTokens: Tokens carved from `contextSize` for response + + /// system-prompt headroom (default ``defaultReservedTokens``). + /// - tokenizer: Inject the backend's tokenizer for a guaranteed budget; + /// `nil` (default) makes the budget advisory (chars/4 heuristic). + public static func truncating( + threshold: Double = 0.90, + contextSize: Int, + reservedTokens: Int = defaultReservedTokens, + tokenizer: (any TokenizerProvider)? = nil + ) -> DefaultCompressionPolicy { + DefaultCompressionPolicy( + strategy: TruncatingCompressionStrategy(), + threshold: threshold, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer + ) + } + + /// Zero-inference scored selection (recency / length / keyword density). + /// `headBudgetFraction > 0` pins the oldest messages to counter the + /// "lost in the middle" effect. + public static func extractive( + threshold: Double = 0.75, + headBudgetFraction: Double = 0.0, + contextSize: Int, + reservedTokens: Int = defaultReservedTokens, + tokenizer: (any TokenizerProvider)? = nil + ) -> DefaultCompressionPolicy { + DefaultCompressionPolicy( + strategy: ExtractiveCompressionStrategy(headBudgetFraction: headBudgetFraction), + threshold: threshold, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer + ) + } + + /// Inference-backed summary of old turns prepended to a verbatim recent + /// tail. Falls back to extractive when no summary can be produced. + /// + /// Prefer the **post-turn** seam for anchored: it runs a full summariser + /// round-trip, and on the pre-turn seam that latency is paid before the + /// user's just-typed message renders. + /// + /// - Parameters: + /// - summarizerInputWindow: the summariser's REAL window, used to size how + /// much old text it reads — set this to the backend's true context size + /// when `contextSize` is a small overflow trigger. + /// - summaryTemplate: custom prompt. Note the coupling with + /// `parseSummaryResponse`: a custom template should emit + /// `UPPERCASE-FIELD: value` lines (≥2) or the parser degrades to a + /// raw-truncated brief. `{old_text}` is the substitution placeholder. + public static func anchored( + threshold: Double = 0.85, + contextSize: Int, + reservedTokens: Int = defaultReservedTokens, + summarizerInputWindow: Int? = nil, + summaryTemplate: String? = nil, + tokenizer: (any TokenizerProvider)? = nil + ) -> DefaultCompressionPolicy { + DefaultCompressionPolicy( + strategy: AnchoredCompressionStrategy( + summarizerResponseBuffer: reservedTokens, + summarizerInputWindow: summarizerInputWindow, + summaryTemplate: summaryTemplate + ), + threshold: threshold, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer + ) + } + + // MARK: - CompressionPolicy (post-turn) + + public func shouldCompress(promptTokens: Int, contextSize: Int, contextUtilization: Double) -> Bool { + contextSize > 0 && contextUtilization >= threshold + } + + public func compress( + history: [ChatMessage], + sessionID: UUID, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + // Guard the degenerate window: if the reservation meets or exceeds the + // context the history budget is zero, and every pass would report + // "over budget" forever (the 512-token simulator cap is the canonical + // trap). Skip rather than churn. + guard contextSize > reservedTokens else { + Log.inference.warning( + "[Compression] contextSize \(contextSize) <= reservedTokens \(reservedTokens); skipping compression (no usable history budget)" + ) + return history + } + return try await strategy.compress( + history: history, contextSize: contextSize, + reservedTokens: reservedTokens, tokenizer: tokenizer, generate: generate + ) + } + + // MARK: - PreTurnCompressionPolicy (pre-turn) + + public func shouldCompressBeforeTurn(messageCount: Int, lastPromptTokens: Int?) -> Bool { + guard contextSize > 0, let promptTokens = lastPromptTokens else { return false } + return Double(promptTokens) / Double(contextSize) >= threshold + } + + public func compressBeforeTurn( + history: [ChatMessage], + sessionID: UUID, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + try await compress(history: history, sessionID: sessionID, generate: generate) + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift new file mode 100644 index 000000000..327fdc984 --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/ExtractiveCompressionStrategy.swift @@ -0,0 +1,166 @@ +import Foundation +import ManifoldInference + +/// Zero-inference scored selection. Reserves a verbatim recency tail (and, +/// optionally, a verbatim head for establishing context), then greedily fills +/// the remaining budget with the highest-scoring older messages. Score blends +/// recency, content length, and capitalized-word density (a cheap proper-noun +/// proxy). +/// +/// Adapted from Fireside's `ExtractiveStoryCompressor`. The `headBudgetFraction` +/// knob is new: pinning the oldest messages counters the "lost in the middle" +/// effect, where models attend most to the start and end of context. +struct ExtractiveCompressionStrategy: CompressionStrategy { + let name = "extractive" + + /// Fraction of the budget reserved for the verbatim recency tail. + let tailBudgetFraction: Double + /// Fraction of the budget reserved for a verbatim head (oldest messages). + /// `0` disables head preservation (Fireside's original behavior). + let headBudgetFraction: Double + let recencyWeight: Double + let lengthWeight: Double + /// Weight for capitalized-word density. NOTE: this signal assumes + /// English-like prose where proper nouns and sentence starts are + /// capitalized. It degrades on all-lowercase text, source code, and + /// non-cased scripts (CJK), where density trends to ~0 and the term simply + /// drops out. It is the smallest weight (0.2) precisely so it only *nudges* + /// selection rather than dominating it. + let keywordDensityWeight: Double + + /// Combined ceiling for the verbatim head + tail fractions. Past this the + /// pinned-verbatim core could equal or exceed the whole budget, leaving no + /// room for scored selection and risking an over-budget result before + /// scoring even runs. + static let maxVerbatimCoreFraction = 0.8 + + init( + tailBudgetFraction: Double = 0.40, + headBudgetFraction: Double = 0.0, + recencyWeight: Double = 0.5, + lengthWeight: Double = 0.3, + keywordDensityWeight: Double = 0.2 + ) { + self.tailBudgetFraction = tailBudgetFraction + self.headBudgetFraction = headBudgetFraction + self.recencyWeight = recencyWeight + self.lengthWeight = lengthWeight + self.keywordDensityWeight = keywordDensityWeight + } + + func compress( + history: [ChatMessage], + contextSize: Int, + reservedTokens: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + guard !history.isEmpty else { return [] } + + let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens) + let tokens = history.map { estimateTokens($0, tokenizer: tokenizer) } + let originalTokens = tokens.reduce(0, +) + + // Everything fits, or a single message: never evict all history. + if originalTokens <= budget || history.count == 1 { + return history + } + + let count = history.count + var keep = Set() + var used = 0 + + // Load-bearing records are always kept. + for i in 0..= tailBudget { break } + } + // Always preserve the newest message. + if !keep.contains(count - 1) { + keep.insert(count - 1) + used += tokens[count - 1] + } + + // --- Verbatim head (oldest) — anti "lost in the middle" --- + if effectiveHeadFraction > 0 { + let headBudget = Int(Double(budget) * effectiveHeadFraction) + var headUsed = 0 + for i in 0..= headBudget { break } + } + } + + // --- Score and greedily select the remainder --- + struct Scored { let index: Int; let score: Double; let tokens: Int } + var candidates: [Scored] = [] + for i in 0.. 1 ? Double(i) / Double(count - 1) : 1.0 + let length = min(1.0, Double(tokens[i]) / 200.0) + let density = keywordDensity(of: history[i].content) + let score = recency * recencyWeight + length * lengthWeight + density * keywordDensityWeight + candidates.append(Scored(index: i, score: score, tokens: tokens[i])) + } + candidates.sort { $0.score > $1.score } + + for candidate in candidates { + if used + candidate.tokens <= budget { + keep.insert(candidate.index) + used += candidate.tokens + } + } + + // Final budget enforcement: the verbatim tail/head admission can push + // the union over budget when those bands admit large messages (each + // band only checks its own sub-budget). Evict kept non-load-bearing + // messages — oldest first, but never the newest — until the union fits. + // Load-bearing records are never evicted (they survive regardless of + // budget by contract). + if used > budget { + let newest = count - 1 + for i in 0.. Double { + let words = content.split(whereSeparator: { $0.isWhitespace }) + guard !words.isEmpty else { return 0 } + let capitalized = words.filter { $0.first?.isUppercase == true }.count + return Double(capitalized) / Double(words.count) + } +} diff --git a/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift new file mode 100644 index 000000000..4af0cc0da --- /dev/null +++ b/Sources/ManifoldRuntime/Services/Compression/TruncatingCompressionStrategy.swift @@ -0,0 +1,49 @@ +import Foundation +import ManifoldInference + +/// Zero-inference sliding window. Keeps every load-bearing record +/// (`.system` / `.memory`) plus the newest messages that fit the budget, +/// dropping the oldest. The cheapest possible strategy and a safe default when +/// no summariser is available. +struct TruncatingCompressionStrategy: CompressionStrategy { + let name = "truncating" + + func compress( + history: [ChatMessage], + contextSize: Int, + reservedTokens: Int, + tokenizer: (any TokenizerProvider)?, + generate: @Sendable ([ChatMessage]) async throws -> String + ) async throws -> [ChatMessage] { + guard !history.isEmpty else { return [] } + + let budget = historyBudget(contextSize: contextSize, reservedTokens: reservedTokens) + if estimateTokens(history, tokenizer: tokenizer) <= budget { + return history + } + + var kept = Set() + var used = 0 + + // Load-bearing records are always retained. + for (i, message) in history.enumerated() where isLoadBearing(message) { + kept.insert(i) + used += estimateTokens(message, tokenizer: tokenizer) + } + + // Fill remaining budget with the newest non-kept messages. + for i in stride(from: history.count - 1, through: 0, by: -1) { + if kept.contains(i) { continue } + let cost = estimateTokens(history[i], tokenizer: tokenizer) + if kept.isEmpty || used + cost <= budget { + kept.insert(i) + used += cost + } + } + + // Invariant: never drop the newest message. + kept.insert(history.count - 1) + + return kept.sorted().map { history[$0] } + } +} diff --git a/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift b/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift index f23321caf..216ecac17 100644 --- a/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift +++ b/Tests/ManifoldInferenceTests/ContextWindowManagerTests.swift @@ -32,6 +32,68 @@ final class ContextWindowManagerTests: XCTestCase { XCTAssertEqual(ContextWindowManager.estimateTokenCount("a"), 1) } + // MARK: - Per-part / per-message estimation (#1885 finding 9) + + func test_estimateTokenCount_textPart_matchesStringOverload() { + let text = String(repeating: "a", count: 100) // 25 tokens + XCTAssertEqual(ContextWindowManager.estimateTokenCount(MessagePart.text(text)), + ContextWindowManager.estimateTokenCount(text)) + } + + func test_estimateTokenCount_imagePart_isNonZeroFixed() { + let part = MessagePart.image(data: Data(repeating: 0, count: 8), mimeType: "image/png") + XCTAssertEqual(ContextWindowManager.estimateTokenCount(part), + ContextWindowManager.imagePartTokenEstimate) + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0, + "image must not estimate as zero") + } + + func test_estimateTokenCount_audioPart_isNonZeroFixed() { + let part = MessagePart.audio(url: URL(fileURLWithPath: "/tmp/a.m4a"), duration: 3, waveform: nil) + XCTAssertEqual(ContextWindowManager.estimateTokenCount(part), + ContextWindowManager.audioPartTokenEstimate) + } + + func test_estimateTokenCount_toolCallPart_countsNameAndArgs() { + let part = MessagePart.toolCall(ToolCall( + id: "c1", toolName: "get_weather", + arguments: String(repeating: "x", count: 80))) + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0, + "tool call args must be counted") + } + + func test_estimateTokenCount_toolResultPart_countsPayload() { + let part = MessagePart.toolResult(ToolResult( + callId: "c1", content: String(repeating: "y", count: 120))) + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(part), 0) + } + + /// The crux: a message with ONLY non-text parts used to estimate as zero + /// via `.content`. The per-message estimate must be non-zero. + func test_estimateTokenCount_imageOnlyMessage_isNonZero() { + let msg = ChatMessage( + role: .user, + contentParts: [.image(data: Data(repeating: 0, count: 8), mimeType: "image/png")], + sessionID: UUID()) + XCTAssertEqual(msg.content, "", "precondition: .content is empty for image-only message") + XCTAssertGreaterThan(ContextWindowManager.estimateTokenCount(msg), 0, + "image-only message must not estimate as zero") + } + + func test_estimateTokenCount_message_sumsAcrossParts() { + let text = String(repeating: "a", count: 100) // 25 tokens + let msg = ChatMessage( + role: .user, + contentParts: [ + .text(text), + .image(data: Data(repeating: 0, count: 8), mimeType: "image/png"), + ], + sessionID: UUID()) + let expected = ContextWindowManager.estimateTokenCount(text) + + ContextWindowManager.imagePartTokenEstimate + XCTAssertEqual(ContextWindowManager.estimateTokenCount(msg), expected) + } + // MARK: - Context Size Resolution func test_resolveContextSize_sessionOverrideTakesPriority() { diff --git a/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift new file mode 100644 index 000000000..3ffb32d55 --- /dev/null +++ b/Tests/ManifoldRuntimeTests/DefaultCompressionPolicyTests.swift @@ -0,0 +1,502 @@ +@preconcurrency import XCTest +import Foundation +@testable import ManifoldRuntime +@testable import ManifoldInference + +/// Unit coverage for the batteries-included compression strategies and the +/// ``DefaultCompressionPolicy`` wrapper. Pure value transforms — no SwiftData, +/// no runtime wiring (that lives in `CompressionPolicyTests`). +final class DefaultCompressionPolicyTests: XCTestCase { + + // MARK: - Fixtures + + private let sessionID = UUID() + + /// Builds a message whose token cost scales with `words`. + private func msg(_ role: MessageRole, words: Int, kind: MessageKind = .chat) -> ChatMessage { + let content = Array(repeating: "lorem", count: words).joined(separator: " ") + return ChatMessage(role: role, content: content, sessionID: sessionID, kind: kind) + } + + /// Long alternating conversation that overflows `contextSize`. + private func overflowingHistory(turns: Int = 12, words: Int = 120) -> [ChatMessage] { + (0.. Int { max(0, contextSize - reservedTokens) } + private func tokens(_ messages: [ChatMessage]) -> Int { + messages.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1, tokenizer: nil) } + } + + /// A real (non-nil) tokenizer to exercise the tokenizer-injected path. + /// Counts whitespace-separated words — deterministic and != the chars/4 + /// heuristic, so a test that passes it really takes the tokenizer branch. + private struct WordTokenizer: TokenizerProvider { + func tokenCount(_ text: String) -> Int { + max(1, text.split(whereSeparator: { $0.isWhitespace }).count) + } + } + + private static func echoGenerate(_: [ChatMessage]) async throws -> String { + "TOPIC: testing\nKEY POINTS: a; b; c\nLAST DISCUSSED: the end" + } + + // MARK: - Truncating + + func testTruncatingLeavesSmallHistoryUntouched() async throws { + let history = [msg(.user, words: 5), msg(.assistant, words: 5)] + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) + XCTAssertEqual(out.map(\.id), history.map(\.id)) + } + + func testTruncatingDropsOldestAndKeepsNewest() async throws { + let history = overflowingHistory() + XCTAssertGreaterThan(tokens(history), budget()) // precondition: actually overflows + + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) + + XCTAssertLessThan(out.count, history.count, "expected eviction") + XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive") + XCTAssertFalse(out.contains { $0.id == history.first?.id }, "oldest should be dropped") + XCTAssertLessThanOrEqual(tokens(out), budget()) + } + + /// When load-bearing records alone exhaust the budget, the greedy backward + /// fill cannot admit the newest chat message — only the explicit + /// never-drop-newest invariant keeps it. Guards that invariant directly + /// (the over-budget-tail path the other truncating tests don't exercise). + func testTruncatingKeepsNewestEvenWhenLoadBearingExceedsBudget() async throws { + let history = [ + msg(.system, words: 4_000, kind: .chat), // load-bearing, alone over budget + msg(.user, words: 5), + msg(.assistant, words: 5) // newest, tiny + ] + XCTAssertGreaterThan(tokens([history[0]]), budget(), "precondition: load-bearing alone overflows") + + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) + + XCTAssertEqual(out.last?.id, history.last?.id, "newest must survive even when load-bearing fills the budget") + XCTAssertTrue(out.contains { $0.role == .system }, "load-bearing record retained") + } + + func testTruncatingPreservesLoadBearingRecords() async throws { + var history = [msg(.system, words: 10, kind: .chat)] // system role + history.append(msg(.assistant, words: 10, kind: .memory("summary"))) // memory kind + history.append(contentsOf: overflowingHistory()) + + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) + + XCTAssertTrue(out.contains { $0.role == .system }, "system prompt must survive") + XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false }, + "prior summary must survive") + } + + /// Real-tokenizer path: a deterministic word tokenizer (not chars/4) must + /// still reduce below the budget it computes. + func testTruncatingWithRealTokenizerReducesBelowBudget() async throws { + let tok = WordTokenizer() + let history = overflowingHistory(turns: 20, words: 200) + let out = try await TruncatingCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: tok, generate: { _ in "" }) + let usedWords = out.reduce(0) { $0 + ContextWindowManager.estimateTokenCount($1, tokenizer: tok) } + XCTAssertLessThanOrEqual(usedWords, budget()) + XCTAssertEqual(out.last?.id, history.last?.id) + } + + // MARK: - Extractive + + func testExtractiveReducesBelowBudgetAndKeepsNewest() async throws { + let history = overflowingHistory() + let out = try await ExtractiveCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: { _ in "" }) + + XCTAssertLessThanOrEqual(tokens(out), budget()) + XCTAssertEqual(out.last?.id, history.last?.id) + // Output must be strictly increasing in original history index — guards + // chronological order against re-ordering (not a self-sort tautology). + let indices = out.map { m in history.firstIndex { $0.id == m.id }! } + XCTAssertEqual(indices, indices.sorted(), "output indices must be sorted") + for i in 1..` scratchpad must not appear in the summary record. + func testAnchoredStripsLeakedThinkingFromSummary() async throws { + let history = overflowingHistory() + let leaky: @Sendable ([ChatMessage]) async throws -> String = { _ in + "I should mention SECRET_LEAK while reasoning\nTOPIC: testing\nKEY POINTS: a; b; c" + } + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: leaky) + let summary = try XCTUnwrap(out.first) + XCTAssertFalse(summary.content.contains("SECRET_LEAK"), "thinking must be stripped") + XCTAssertFalse(summary.content.contains("")) + XCTAssertTrue(summary.content.contains("TOPIC"), "visible fields survive") + } + + /// Chunk-and-fold: oldText exceeds the usable summariser input window, so + /// the strategy chunks. `generate` records every prompt; assert ≥2 calls + /// and that content from the OLDEST chunk is represented in the brief. + func testAnchoredChunkAndFold() async throws { + actor PromptRecorder { + var prompts: [String] = [] + func record(_ p: String) { prompts.append(p) } + } + let recorder = PromptRecorder() + let history = overflowingHistory(turns: 30, words: 120) + // First message carries a unique marker we can trace to the oldest chunk. + var tagged = history + tagged[0] = ChatMessage(role: .user, content: "OLDEST_MARKER " + history[0].content, sessionID: sessionID) + + let generate: @Sendable ([ChatMessage]) async throws -> String = { msgs in + let prompt = msgs.first?.content ?? "" + await recorder.record(prompt) + // Echo back any marker the chunk contained so it reaches the fold. + if prompt.contains("OLDEST_MARKER") { + return "TOPIC: oldest\nKEY POINTS: OLDEST_MARKER seen; b; c" + } + return "TOPIC: chunk\nKEY POINTS: a; b; c" + } + + // summarizerInputWindow small enough that old text exceeds the usable budget. + let out = try await AnchoredCompressionStrategy( + summarizerResponseBuffer: 64, summarizerInputWindow: 600 + ).compress( + history: tagged, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: generate) + + let calls = await recorder.prompts + XCTAssertGreaterThanOrEqual(calls.count, 2, "chunking should produce ≥2 generate calls (chunks + fold)") + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected memory summary") } + XCTAssertTrue(summary.content.contains("OLDEST_MARKER") || calls.contains { $0.contains("OLDEST_MARKER") }, + "oldest chunk content must be represented") + } + + /// Chunk-failure: the FIRST chunk's `generate` throws → that chunk's raw + /// content is preserved via the truncated-text fallback (not lost), and the + /// overall compression still produces a `.memory` summary because the fold + /// (a later call) succeeds. + func testAnchoredChunkFailurePreservesContent() async throws { + actor CallCounter { var n = 0; func next() -> Int { n += 1; return n } } + let counter = CallCounter() + let history = overflowingHistory(turns: 30, words: 120) + var tagged = history + tagged[0] = ChatMessage(role: .user, content: "OLDEST_MARKER " + history[0].content, sessionID: sessionID) + + // Throw on the first generate call (a chunk), succeed on all later calls + // (remaining chunks + the fold). The first chunk's raw text falls back + // via truncateToFit so its content is not dropped. + let generate: @Sendable ([ChatMessage]) async throws -> String = { _ in + struct ChunkBoom: Error {} + if await counter.next() == 1 { throw ChunkBoom() } + return "TOPIC: chunk\nKEY POINTS: a; b; c" + } + let out = try await AnchoredCompressionStrategy( + summarizerResponseBuffer: 64, summarizerInputWindow: 600 + ).compress( + history: tagged, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: generate) + // The summary record exists (top-level summarise succeeded on the fold). + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected memory summary despite chunk failure") } + let totalCalls = await counter.n + XCTAssertGreaterThanOrEqual(totalCalls, 2, "should retry remaining chunks + fold after one chunk failed") + XCTAssertFalse(out.isEmpty) + } + + /// Summary-floor: the tail consumes ~the whole budget, leaving no room for + /// the summary. The strategy must still emit a non-empty `.memory` record + /// AND keep the result within budget. + func testAnchoredSummaryFloor() async throws { + // tailBudgetFraction 0.95 → tail eats almost all budget. + let history = overflowingHistory(turns: 20, words: 120) + let out = try await AnchoredCompressionStrategy(tailBudgetFraction: 0.95).compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: Self.echoGenerate) + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected floored memory summary") } + XCTAssertFalse(summary.content.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty, + "summary floor must produce a non-empty brief") + XCTAssertLessThanOrEqual(tokens(out), budget()) + } + + /// Cancellation mid-summarise: cancelling the surrounding Task returns the + /// tail with NO summary record. + func testAnchoredCancellationMidSummarize() async throws { + let history = overflowingHistory() + // Hoist instance properties to locals so the Task closure doesn't + // capture (non-Sendable) `self`. + let ctx = contextSize + let reserve = reservedTokens + let task = Task { () -> [ChatMessage] in + try await AnchoredCompressionStrategy().compress( + history: history, contextSize: ctx, reservedTokens: reserve, + tokenizer: nil, + generate: { _ in + // Yield so cancellation lands before/within summarise. + try await Task.sleep(nanoseconds: 50_000_000) + return "TOPIC: x\nKEY POINTS: a; b" + }) + } + task.cancel() + let out = try await task.value + XCTAssertFalse(out.contains { if case .memory = $0.kind { return true }; return false }, + "cancellation must not inject a summary record") + XCTAssertEqual(out.last?.id, history.last?.id, "tail preserved on cancel") + } + + /// `parseSummaryResponse` <2-field raw-fallback branch: a summary with only + /// one recognisable field degrades to the trimmed raw response, not an + /// empty/placeholder brief. + func testAnchoredSingleFieldRawFallback() async throws { + let history = overflowingHistory() + let oneField: @Sendable ([ChatMessage]) async throws -> String = { _ in + "TOPIC: only one field here and some prose that should survive verbatim" + } + let out = try await AnchoredCompressionStrategy().compress( + history: history, contextSize: contextSize, reservedTokens: reservedTokens, + tokenizer: nil, generate: oneField) + let summary = try XCTUnwrap(out.first) + guard case .memory = summary.kind else { return XCTFail("expected memory summary") } + XCTAssertTrue(summary.content.contains("prose that should survive"), + "single-field response degrades to trimmed raw text") + } + + // MARK: - Policy thresholds & seam agreement + + func testShouldCompressHonorsThreshold() { + let policy = DefaultCompressionPolicy.extractive(threshold: 0.75, contextSize: contextSize) + XCTAssertTrue(policy.shouldCompress(promptTokens: 0, contextSize: contextSize, contextUtilization: 0.80)) + XCTAssertFalse(policy.shouldCompress(promptTokens: 0, contextSize: contextSize, contextUtilization: 0.50)) + XCTAssertFalse(policy.shouldCompress(promptTokens: 0, contextSize: 0, contextUtilization: 0.99), + "unknown context size never compresses") + } + + /// Trigger boundary, hand-computed. Both seams fire at exactly the + /// threshold and decline just below it; and the rounding asymmetry where a + /// post-turn caller passing a rounded utilisation fires while the pre-turn + /// recompute from raw tokens stays below. + func testTriggerAsymmetryBoundary() { + let threshold = 0.80 + let policy = DefaultCompressionPolicy.truncating(threshold: threshold, contextSize: contextSize) + + // promptTokens chosen so utilisation == threshold exactly. + let atTokens = Int(threshold * Double(contextSize)) // 0.80 * 2048 = 1638 (1638/2048 = 0.7998…) + // At the exact integer-token boundary the recomputed utilisation may be + // a hair below threshold; assert both seams agree with the hand-computed + // recompute regardless. + let atUtil = Double(atTokens) / Double(contextSize) + let preAt = policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: atTokens) + let postAt = policy.shouldCompress(promptTokens: atTokens, contextSize: contextSize, contextUtilization: atUtil) + XCTAssertEqual(preAt, postAt) + XCTAssertEqual(preAt, atUtil >= threshold, "pre-turn matches hand-computed bool at boundary") + + // threshold − epsilon: definitively below → both decline. + let belowTokens = Int((threshold - 0.01) * Double(contextSize)) + let belowUtil = Double(belowTokens) / Double(contextSize) + XCTAssertFalse(policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: belowTokens)) + XCTAssertFalse(policy.shouldCompress(promptTokens: belowTokens, contextSize: contextSize, contextUtilization: belowUtil)) + + // Rounding asymmetry: a post-turn caller that ROUNDS utilisation up to + // the threshold fires, while the pre-turn recompute from raw tokens + // (which is just under) does not. Demonstrates the documented seam gap. + let justUnderTokens = Int(threshold * Double(contextSize)) - 1 // 1637/2048 = 0.79931 < 0.80 + let recomputed = Double(justUnderTokens) / Double(contextSize) + XCTAssertLessThan(recomputed, threshold, "raw recompute is below threshold") + let preJustUnder = policy.shouldCompressBeforeTurn(messageCount: 1, lastPromptTokens: justUnderTokens) + let postRounded = policy.shouldCompress(promptTokens: justUnderTokens, contextSize: contextSize, + contextUtilization: threshold) // caller passes rounded value + XCTAssertFalse(preJustUnder, "pre-turn recompute declines just below threshold") + XCTAssertTrue(postRounded, "post-turn fires when caller passes a utilisation already at threshold") + } + + func testPreTurnWithoutPriorTokensDoesNotCompress() { + let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: contextSize) + XCTAssertFalse(policy.shouldCompressBeforeTurn(messageCount: 500, lastPromptTokens: nil)) + } + + func testPolicyCompressDelegatesToStrategy() async throws { + // Context comfortably above the default 2048 reserve so the policy's + // small-window guard doesn't skip; history overflows the resulting budget. + let largeContext = 8_192 + let history = overflowingHistory(turns: 80, words: 120) + let policy = DefaultCompressionPolicy.anchored(threshold: 0.85, contextSize: largeContext) + let out = try await policy.compress(history: history, sessionID: sessionID, generate: Self.echoGenerate) + XCTAssertTrue(out.contains { if case .memory = $0.kind { return true }; return false }) + + // compressBeforeTurn shares the same path. + let preOut = try await policy.compressBeforeTurn(history: history, sessionID: sessionID, generate: Self.echoGenerate) + XCTAssertEqual(preOut.first?.kind.rawStorage, out.first?.kind.rawStorage) + } + + /// Finding 1 guard: a context window at or below the reservation has no + /// usable history budget; the policy must skip compression (return history + /// unchanged) rather than churn against a zero/negative budget. + func testPolicySkipsWhenContextSmallerThanReserve() async throws { + // 512-token simulator cap with the default 2048 reserve. + let policy = DefaultCompressionPolicy.truncating(contextSize: 512) + let history = overflowingHistory() + let out = try await policy.compress(history: history, sessionID: sessionID, generate: { _ in "" }) + XCTAssertEqual(out.map(\.id), history.map(\.id), "no usable budget → history unchanged") + } + + /// The default reserve is clearly larger than the legacy 512. + func testDefaultReserveIsLargerThanLegacy() { + XCTAssertGreaterThan(DefaultCompressionPolicy.defaultReservedTokens, 512) + } + + /// Context-scaled reserve grows with the window but stays under half of it. + func testScaledReserveBounds() { + let small = DefaultCompressionPolicy.scaledReservedTokens(forContextSize: 4_096) + XCTAssertGreaterThanOrEqual(small, DefaultCompressionPolicy.defaultReservedTokens) + let large = DefaultCompressionPolicy.scaledReservedTokens(forContextSize: 131_072) + XCTAssertEqual(large, 131_072 / 8, "scales to ~12.5% of a big window") + XCTAssertLessThanOrEqual(large, 131_072 / 2, "never exceeds half the window") + } +}