diff --git a/packages/core/src/core/compression/utils.test.ts b/packages/core/src/core/compression/utils.test.ts index 4fd536cc58..980d1c7b6a 100644 --- a/packages/core/src/core/compression/utils.test.ts +++ b/packages/core/src/core/compression/utils.test.ts @@ -14,7 +14,7 @@ */ import { describe, it, expect } from 'vitest'; -import type { IContent } from '../../services/history/IContent.js'; +import type { IContent, MediaBlock } from '../../services/history/IContent.js'; import { adjustForToolCallBoundary, findForwardValidSplitPoint, @@ -65,6 +65,38 @@ function toolResponseMsg( ], }; } +function mediaBlock( + mimeType: string, + filename?: string, + data = 'base64data', + caption?: string, +): MediaBlock { + return { + type: 'media', + mimeType, + filename, + data, + encoding: 'base64', + caption, + }; +} + +function humanMsgWithMedia( + text: string, + ...mediaBlocks: MediaBlock[] +): IContent { + return { + speaker: 'human', + blocks: [{ type: 'text', text }, ...mediaBlocks], + }; +} + +function humanMsgOnlyMedia(...mediaBlocks: MediaBlock[]): IContent { + return { + speaker: 'human', + blocks: mediaBlocks, + }; +} // --------------------------------------------------------------------------- // adjustForToolCallBoundary @@ -620,4 +652,185 @@ describe('sanitizeHistoryForCompression', () => { const result = sanitizeHistoryForCompression([msg]); expect(result[0].speaker).toBe('human'); }); + + // Media block tests (Issue #1875) + it('converts media blocks to text placeholders with filename', () => { + const history = [ + humanMsgOnlyMedia(mediaBlock('application/pdf', 'document.pdf')), + ]; + const result = sanitizeHistoryForCompression(history); + expect(result).toHaveLength(1); + expect(result[0].speaker).toBe('human'); + expect(result[0].blocks).toHaveLength(1); + expect(result[0].blocks[0].type).toBe('text'); + expect((result[0].blocks[0] as { text: string }).text).toBe( + '[Attached PDF: document.pdf]', + ); + }); + + it('converts media blocks to text 
placeholders using mimeType when no filename', () => { + const history = [humanMsgOnlyMedia(mediaBlock('image/png'))]; + const result = sanitizeHistoryForCompression(history); + expect(result).toHaveLength(1); + expect(result[0].blocks[0].type).toBe('text'); + expect((result[0].blocks[0] as { text: string }).text).toBe( + '[Attached image: image/png]', + ); + }); + + it('converts media blocks with empty filename to text placeholders using mimeType', () => { + const history = [humanMsgOnlyMedia(mediaBlock('image/png', ''))]; + const result = sanitizeHistoryForCompression(history); + expect(result).toHaveLength(1); + expect(result[0].blocks[0].type).toBe('text'); + expect((result[0].blocks[0] as { text: string }).text).toBe( + '[Attached image: image/png]', + ); + }); + + it('handles different media categories (image, pdf, audio, video, unknown)', () => { + const imageBlock = mediaBlock('image/jpeg', 'photo.jpg'); + const pdfBlock = mediaBlock('application/pdf', 'report.pdf'); + const audioBlock = mediaBlock('audio/mp3', 'song.mp3'); + const videoBlock = mediaBlock('video/mp4', 'movie.mp4'); + const unknownBlock = mediaBlock('application/octet-stream', 'data.bin'); + + const history = [ + humanMsgOnlyMedia( + imageBlock, + pdfBlock, + audioBlock, + videoBlock, + unknownBlock, + ), + ]; + const result = sanitizeHistoryForCompression(history); + const texts = result[0].blocks.map((b) => (b as { text: string }).text); + + expect(texts).toContain('[Attached image: photo.jpg]'); + expect(texts).toContain('[Attached PDF: report.pdf]'); + expect(texts).toContain('[Attached audio: song.mp3]'); + expect(texts).toContain('[Attached video: movie.mp4]'); + expect(texts).toContain('[Attached unknown: data.bin]'); + }); + + it('handles mixed content with text + media blocks in same message', () => { + const history = [ + humanMsgWithMedia( + 'Please analyze this document', + mediaBlock('application/pdf', 'report.pdf'), + ), + ]; + const result = 
sanitizeHistoryForCompression(history); + expect(result[0].blocks).toHaveLength(2); + expect(result[0].blocks[0].type).toBe('text'); + expect((result[0].blocks[0] as { text: string }).text).toBe( + 'Please analyze this document', + ); + expect(result[0].blocks[1].type).toBe('text'); + expect((result[0].blocks[1] as { text: string }).text).toBe( + '[Attached PDF: report.pdf]', + ); + }); + + it('converts messages with only media blocks properly', () => { + const history = [ + humanMsgOnlyMedia(mediaBlock('image/png', 'screenshot.png')), + ]; + const result = sanitizeHistoryForCompression(history); + expect(result).toHaveLength(1); + expect(result[0].speaker).toBe('human'); + expect(result[0].blocks).toHaveLength(1); + expect(result[0].blocks[0].type).toBe('text'); + expect((result[0].blocks[0] as { text: string }).text).toBe( + '[Attached image: screenshot.png]', + ); + }); + + it('does not change speaker for media block messages (unlike tool messages)', () => { + // Media placeholders keep the message's original speaker; only messages whose speaker is 'tool' are re-tagged to 'human' + const msg: IContent = { + speaker: 'ai', + blocks: [mediaBlock('image/jpeg', 'photo.jpg')], + }; + const result = sanitizeHistoryForCompression([msg]); + expect(result[0].speaker).toBe('ai'); // unchanged + expect(result[0].blocks[0].type).toBe('text'); + expect((result[0].blocks[0] as { text: string }).text).toBe( + '[Attached image: photo.jpg]', + ); + }); + + it('handles mixed tool and media blocks in same message', () => { + const msg: IContent = { + speaker: 'ai', + blocks: [ + { type: 'text', text: 'Analyzing file and document' }, + mediaBlock('application/pdf', 'document.pdf'), + { + type: 'tool_call', + id: 'c1', + name: 'read_file', + parameters: { path: '/tmp/test' }, + }, + ], + }; + const result = sanitizeHistoryForCompression([msg]); + expect(result[0].blocks).toHaveLength(3); + expect((result[0].blocks[0] as { text: string }).text).toBe( + 'Analyzing file and document', + ); + expect((result[0].blocks[1] as { text: 
string }).text).toBe( + '[Attached PDF: document.pdf]', + ); + expect((result[0].blocks[2] as { text: string }).text).toContain( + '[Tool Call: read_file]', + ); + }); + + it('prefers media caption over filename in compression placeholders', () => { + const history = [ + humanMsgOnlyMedia( + mediaBlock( + 'image/png', + 'diagram.png', + 'base64data', + 'Architecture diagram', + ), + ), + ]; + const result = sanitizeHistoryForCompression(history); + expect(result).toHaveLength(1); + expect(result[0].blocks).toHaveLength(1); + expect((result[0].blocks[0] as { text: string }).text).toBe( + '[Attached image: Architecture diagram]', + ); + }); + + it('re-tags tool speaker to human and placeholderizes media blocks in tool messages', () => { + const msg: IContent = { + speaker: 'tool', + blocks: [ + { + type: 'tool_response', + callId: 'c1', + toolName: 'read_file', + result: 'file contents here', + }, + mediaBlock('image/png', 'screenshot.png'), + ], + }; + const result = sanitizeHistoryForCompression([msg]); + expect(result[0].speaker).toBe('human'); + expect(result[0].blocks).toHaveLength(2); + expect((result[0].blocks[0] as { text: string }).text).toContain( + '[Tool Result: read_file]', + ); + expect((result[0].blocks[0] as { text: string }).text).toContain( + 'file contents here', + ); + expect((result[0].blocks[1] as { text: string }).text).toBe( + '[Attached image: screenshot.png]', + ); + }); }); diff --git a/packages/core/src/core/compression/utils.ts b/packages/core/src/core/compression/utils.ts index 1a7ea98918..704785b2a4 100644 --- a/packages/core/src/core/compression/utils.ts +++ b/packages/core/src/core/compression/utils.ts @@ -18,9 +18,11 @@ import type { ContentBlock, IContent, + MediaBlock, TextBlock, } from '../../services/history/IContent.js'; import type { IProvider } from '../../providers/IProvider.js'; +import { classifyMediaBlock } from '../../providers/utils/mediaUtils.js'; /** * Aggregate text from content blocks, handling spacing between text 
and @@ -337,15 +339,40 @@ export async function runVerificationPass( } /** - * Convert tool_call and tool_response blocks to plain text representations - * so the compression request doesn't trip Anthropic's strict tool_use / - * tool_result pairing validation. Orphaned tool blocks (from interrupted - * loops or the loop-detector halting mid-tool-call) would otherwise cause - * 400 errors when sent to the LLM for summarisation. + * Convert a MediaBlock to a concise text placeholder for compression. + * This prevents provider-specific media types (like PDF "file" parts) from + * reaching the compression LLM call, which would cause 400 errors on providers + * that don't support certain media types. + * + * Format: [Attached <category>: <identifier>] + * The identifier prefers caption for accessibility/context, then falls back to + * filename, mimeType, and finally "unknown". + */ +export function mediaBlockToCompressionPlaceholder(media: MediaBlock): string { + const category = classifyMediaBlock(media); + // Prefer caption first (for accessibility/context), then filename, then mimeType, then 'unknown' + const identifier = + media.caption?.trim() || + media.filename?.trim() || + media.mimeType || + 'unknown'; + // Capitalize PDF label for display, keep other categories as-is + const label = category === 'pdf' ? 'PDF' : category; + return `[Attached ${label}: ${identifier}]`; +} + +/** + * Convert tool_call, tool_response, and media blocks to plain text representations + * so the compression request doesn't trip provider-specific validation errors. + * + * - Tool blocks: Anthropic's strict tool_use / tool_result pairing validation + * would reject orphaned tool blocks (from interrupted loops). + * - Media blocks: Providers like Kimi don't support certain media types (e.g., + * PDF "file" parts) and would return 400 errors. * * Messages whose speaker is 'tool' are re-tagged as 'human' since they - * no longer carry structural tool_result blocks. 
All other block types - * (text, thinking, code, media) pass through unchanged. + * no longer carry structural tool_result blocks. Messages with media blocks + * keep their original speaker since media is not speaker-specific. */ export function sanitizeHistoryForCompression( messages: readonly IContent[], @@ -354,7 +381,8 @@ export function sanitizeHistoryForCompression( const hasToolBlocks = msg.blocks.some( (b) => b.type === 'tool_call' || b.type === 'tool_response', ); - if (!hasToolBlocks && msg.speaker !== 'tool') { + const hasMediaBlocks = msg.blocks.some((b) => b.type === 'media'); + if (!hasToolBlocks && !hasMediaBlocks && msg.speaker !== 'tool') { return msg; } @@ -390,6 +418,10 @@ export function sanitizeHistoryForCompression( } return { type: 'text', text } as TextBlock; } + if (block.type === 'media') { + const text = mediaBlockToCompressionPlaceholder(block); + return { type: 'text', text } as TextBlock; + } return block; }) .filter((b): b is ContentBlock => b !== null);