diff --git a/codex-rs/core/src/chat_completions.rs b/codex-rs/core/src/chat_completions.rs index 76cbdf9f0a..1fb2230d79 100644 --- a/codex-rs/core/src/chat_completions.rs +++ b/codex-rs/core/src/chat_completions.rs @@ -17,6 +17,7 @@ use crate::util::backoff; use bytes::Bytes; use codex_otel::otel_event_manager::OtelEventManager; use codex_protocol::models::ContentItem; +use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::ReasoningItemContent; use codex_protocol::models::ResponseItem; use eventsource_stream::Eventsource; @@ -159,16 +160,26 @@ pub(crate) async fn stream_chat_completions( for (idx, item) in input.iter().enumerate() { match item { ResponseItem::Message { role, content, .. } => { + // Build content either as a plain string (typical for assistant text) + // or as an array of content items when images are present (user/tool multimodal). let mut text = String::new(); + let mut items: Vec = Vec::new(); + let mut saw_image = false; + for c in content { match c { ContentItem::InputText { text: t } | ContentItem::OutputText { text: t } => { text.push_str(t); + items.push(json!({"type":"text","text": t})); + } + ContentItem::InputImage { image_url } => { + saw_image = true; + items.push(json!({"type":"image_url","image_url": {"url": image_url}})); } - _ => {} } } + // Skip exact-duplicate assistant messages. if role == "assistant" { if let Some(prev) = &last_assistant_text @@ -179,7 +190,17 @@ pub(crate) async fn stream_chat_completions( last_assistant_text = Some(text.clone()); } - let mut msg = json!({"role": role, "content": text}); + // For assistant messages, always send a plain string for compatibility. + // For user messages, if an image is present, send an array of content items. + let content_value = if role == "assistant" { + json!(text) + } else if saw_image { + json!(items) + } else { + json!(text) + }; + + let mut msg = json!({"role": role, "content": content_value}); if role == "assistant" && let Some(reasoning) = reasoning_by_anchor_index.get(&idx) && let Some(obj) = msg.as_object_mut() @@ -238,10 +259,29 @@ pub(crate) async fn stream_chat_completions( messages.push(msg); } ResponseItem::FunctionCallOutput { call_id, output } => { + // Prefer structured content items when available (e.g., images) + // otherwise fall back to the legacy plain-string content. + let content_value = if let Some(items) = &output.content_items { + let mapped: Vec = items + .iter() + .map(|it| match it { + FunctionCallOutputContentItem::InputText { text } => { + json!({"type":"text","text": text}) + } + FunctionCallOutputContentItem::InputImage { image_url } => { + json!({"type":"image_url","image_url": {"url": image_url}}) + } + }) + .collect(); + json!(mapped) + } else { + json!(output.content) + }; + messages.push(json!({ "role": "tool", "tool_call_id": call_id, - "content": output.content, + "content": content_value, })); } ResponseItem::CustomToolCall { diff --git a/codex-rs/core/src/codex.rs b/codex-rs/core/src/codex.rs index f811211c4f..6342635775 100644 --- a/codex-rs/core/src/codex.rs +++ b/codex-rs/core/src/codex.rs @@ -2047,7 +2047,7 @@ async fn try_run_turn( call_id: String::new(), output: FunctionCallOutputPayload { content: msg.to_string(), - success: None, + ..Default::default() }, }; add_completed(ProcessedResponseItem { @@ -2061,7 +2061,7 @@ async fn try_run_turn( call_id: String::new(), output: FunctionCallOutputPayload { content: message, - success: None, + ..Default::default() }, }; add_completed(ProcessedResponseItem { @@ -2199,41 +2199,6 @@ pub(super) fn get_last_assistant_message_from_turn(responses: &[ResponseItem]) - } }) } -pub(crate) fn convert_call_tool_result_to_function_call_output_payload( - call_tool_result: &CallToolResult, -) -> FunctionCallOutputPayload { - let CallToolResult { - content, - is_error, - structured_content, - } = call_tool_result; - - // In terms of what to send back to the model, we prefer structured_content, - // if available, and fallback to content, otherwise. - let mut is_success = is_error != &Some(true); - let content = if let Some(structured_content) = structured_content - && structured_content != &serde_json::Value::Null - && let Ok(serialized_structured_content) = serde_json::to_string(&structured_content) - { - serialized_structured_content - } else { - match serde_json::to_string(&content) { - Ok(serialized_content) => serialized_content, - Err(err) => { - // If we could not serialize either content or structured_content to - // JSON, flag this as an error. - is_success = false; - err.to_string() - } - } - }; - - FunctionCallOutputPayload { - content, - success: Some(is_success), - } -} - /// Emits an ExitedReviewMode Event with optional ReviewOutput, /// and records a developer message with the review output. pub(crate) async fn exit_review_mode( @@ -2443,7 +2408,7 @@ mod tests { })), }; - let got = convert_call_tool_result_to_function_call_output_payload(&ctr); + let got = FunctionCallOutputPayload::from(&ctr); let expected = FunctionCallOutputPayload { content: serde_json::to_string(&json!({ "ok": true, @@ -2451,6 +2416,7 @@ mod tests { })) .unwrap(), success: Some(true), + ..Default::default() }; assert_eq!(expected, got); @@ -2572,11 +2538,12 @@ mod tests { structured_content: Some(serde_json::Value::Null), }; - let got = convert_call_tool_result_to_function_call_output_payload(&ctr); + let got = FunctionCallOutputPayload::from(&ctr); let expected = FunctionCallOutputPayload { content: serde_json::to_string(&vec![text_block("hello"), text_block("world")]) .unwrap(), success: Some(true), + ..Default::default() }; assert_eq!(expected, got); @@ -2590,10 +2557,11 @@ mod tests { structured_content: Some(json!({ "message": "bad" })), }; - let got = convert_call_tool_result_to_function_call_output_payload(&ctr); + let got = FunctionCallOutputPayload::from(&ctr); let expected = FunctionCallOutputPayload { content: serde_json::to_string(&json!({ "message": "bad" })).unwrap(), success: Some(false), + ..Default::default() }; assert_eq!(expected, got); @@ -2607,10 +2575,11 @@ mod tests { structured_content: None, }; - let got = convert_call_tool_result_to_function_call_output_payload(&ctr); + let got = FunctionCallOutputPayload::from(&ctr); let expected = FunctionCallOutputPayload { content: serde_json::to_string(&vec![text_block("alpha")]).unwrap(), success: Some(true), + ..Default::default() }; assert_eq!(expected, got); diff --git a/codex-rs/core/src/conversation_history.rs b/codex-rs/core/src/conversation_history.rs index 08fa8cebed..bf37351086 100644 --- a/codex-rs/core/src/conversation_history.rs +++ b/codex-rs/core/src/conversation_history.rs @@ -110,7 +110,7 @@ impl ConversationHistory { call_id: call_id.clone(), output: FunctionCallOutputPayload { content: "aborted".to_string(), - success: None, + ..Default::default() }, }, )); @@ -157,7 +157,7 @@ impl ConversationHistory { call_id: call_id.clone(), output: FunctionCallOutputPayload { content: "aborted".to_string(), - success: None, + ..Default::default() }, }, )); @@ -454,7 +454,7 @@ mod tests { call_id: "call-1".to_string(), output: FunctionCallOutputPayload { content: "ok".to_string(), - success: None, + ..Default::default() }, }, ]; @@ -470,7 +470,7 @@ mod tests { call_id: "call-2".to_string(), output: FunctionCallOutputPayload { content: "ok".to_string(), - success: None, + ..Default::default() }, }, ResponseItem::FunctionCall { @@ -504,7 +504,7 @@ mod tests { call_id: "call-3".to_string(), output: FunctionCallOutputPayload { content: "ok".to_string(), - success: None, + ..Default::default() }, }, ]; @@ -560,7 +560,7 @@ mod tests { call_id: "call-x".to_string(), output: FunctionCallOutputPayload { content: "aborted".to_string(), - success: None, + ..Default::default() }, }, ] @@ -637,7 +637,7 @@ mod tests { call_id: "shell-1".to_string(), output: FunctionCallOutputPayload { content: "aborted".to_string(), - success: None, + ..Default::default() }, }, ] @@ -651,7 +651,7 @@ mod tests { call_id: "orphan-1".to_string(), output: FunctionCallOutputPayload { content: "ok".to_string(), - success: None, + ..Default::default() }, }]; let mut h = create_history_with_items(items); @@ -691,7 +691,7 @@ mod tests { call_id: "c2".to_string(), output: FunctionCallOutputPayload { content: "ok".to_string(), - success: None, + ..Default::default() }, }, // Will get an inserted custom tool output @@ -733,7 +733,7 @@ mod tests { call_id: "c1".to_string(), output: FunctionCallOutputPayload { content: "aborted".to_string(), - success: None, + ..Default::default() }, }, ResponseItem::CustomToolCall { @@ -763,7 +763,7 @@ mod tests { call_id: "s1".to_string(), output: FunctionCallOutputPayload { content: "aborted".to_string(), - success: None, + ..Default::default() }, }, ] @@ -828,7 +828,7 @@ mod tests { call_id: "orphan-1".to_string(), output: FunctionCallOutputPayload { content: "ok".to_string(), - success: None, + ..Default::default() }, }]; let mut h = create_history_with_items(items); @@ -862,7 +862,7 @@ mod tests { call_id: "c2".to_string(), output: FunctionCallOutputPayload { content: "ok".to_string(), - success: None, + ..Default::default() }, }, ResponseItem::CustomToolCall { diff --git a/codex-rs/core/src/mcp_tool_call.rs b/codex-rs/core/src/mcp_tool_call.rs index 09846b719a..5166410372 100644 --- a/codex-rs/core/src/mcp_tool_call.rs +++ b/codex-rs/core/src/mcp_tool_call.rs @@ -35,6 +35,7 @@ pub(crate) async fn handle_mcp_tool_call( output: FunctionCallOutputPayload { content: format!("err: {e}"), success: Some(false), + ..Default::default() }, }; } diff --git a/codex-rs/core/src/response_processing.rs b/codex-rs/core/src/response_processing.rs index e18fdd45c8..5c30c1126c 100644 --- a/codex-rs/core/src/response_processing.rs +++ b/codex-rs/core/src/response_processing.rs @@ -61,14 +61,11 @@ pub(crate) async fn process_items( ) => { items_to_record_in_conversation_history.push(item); let output = match result { - Ok(call_tool_result) => { - crate::codex::convert_call_tool_result_to_function_call_output_payload( - call_tool_result, - ) - } + Ok(call_tool_result) => FunctionCallOutputPayload::from(call_tool_result), Err(err) => FunctionCallOutputPayload { content: err.clone(), success: Some(false), + ..Default::default() }, }; items_to_record_in_conversation_history.push(ResponseItem::FunctionCallOutput { diff --git a/codex-rs/core/src/tools/context.rs b/codex-rs/core/src/tools/context.rs index 27d309dc25..d2e47f926f 100644 --- a/codex-rs/core/src/tools/context.rs +++ b/codex-rs/core/src/tools/context.rs @@ -5,6 +5,7 @@ use crate::tools::TELEMETRY_PREVIEW_MAX_LINES; use crate::tools::TELEMETRY_PREVIEW_TRUNCATION_NOTICE; use crate::turn_diff_tracker::TurnDiffTracker; use codex_otel::otel_event_manager::OtelEventManager; +use codex_protocol::models::FunctionCallOutputContentItem; use codex_protocol::models::FunctionCallOutputPayload; use codex_protocol::models::ResponseInputItem; use codex_protocol::models::ShellToolCallParams; @@ -65,7 +66,10 @@ impl ToolPayload { #[derive(Clone)] pub enum ToolOutput { Function { + // Plain text representation of the tool output. content: String, + // Some tool calls such as MCP calls may return structured content that can get parsed into an array of polymorphic content items. + content_items: Option>, success: Option, }, Mcp { @@ -90,7 +94,11 @@ impl ToolOutput { pub fn into_response(self, call_id: &str, payload: &ToolPayload) -> ResponseInputItem { match self { - ToolOutput::Function { content, success } => { + ToolOutput::Function { + content, + content_items, + success, + } => { if matches!(payload, ToolPayload::Custom { .. }) { ResponseInputItem::CustomToolCallOutput { call_id: call_id.to_string(), @@ -99,7 +107,11 @@ impl ToolOutput { } else { ResponseInputItem::FunctionCallOutput { call_id: call_id.to_string(), - output: FunctionCallOutputPayload { content, success }, + output: FunctionCallOutputPayload { + content, + content_items, + success, + }, } } } @@ -163,6 +175,7 @@ mod tests { }; let response = ToolOutput::Function { content: "patched".to_string(), + content_items: None, success: Some(true), } .into_response("call-42", &payload); @@ -183,6 +196,7 @@ mod tests { }; let response = ToolOutput::Function { content: "ok".to_string(), + content_items: None, success: Some(true), } .into_response("fn-1", &payload); @@ -191,6 +205,7 @@ mod tests { ResponseInputItem::FunctionCallOutput { call_id, output } => { assert_eq!(call_id, "fn-1"); assert_eq!(output.content, "ok"); + assert!(output.content_items.is_none()); assert_eq!(output.success, Some(true)); } other => panic!("expected FunctionCallOutput, got {other:?}"), diff --git a/codex-rs/core/src/tools/handlers/apply_patch.rs b/codex-rs/core/src/tools/handlers/apply_patch.rs index 126b734242..1e82b9cf10 100644 --- a/codex-rs/core/src/tools/handlers/apply_patch.rs +++ b/codex-rs/core/src/tools/handlers/apply_patch.rs @@ -82,6 +82,7 @@ impl ToolHandler for ApplyPatchHandler { let content = item?; Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }) } @@ -126,6 +127,7 @@ impl ToolHandler for ApplyPatchHandler { let content = emitter.finish(event_ctx, out).await?; Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/grep_files.rs b/codex-rs/core/src/tools/handlers/grep_files.rs index de3cd3411c..5473f86935 100644 --- a/codex-rs/core/src/tools/handlers/grep_files.rs +++ b/codex-rs/core/src/tools/handlers/grep_files.rs @@ -90,11 +90,13 @@ impl ToolHandler for GrepFilesHandler { if search_results.is_empty() { Ok(ToolOutput::Function { content: "No matches found.".to_string(), + content_items: None, success: Some(false), }) } else { Ok(ToolOutput::Function { content: search_results.join("\n"), + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/list_dir.rs b/codex-rs/core/src/tools/handlers/list_dir.rs index bcea4a756d..1c08243f72 100644 --- a/codex-rs/core/src/tools/handlers/list_dir.rs +++ b/codex-rs/core/src/tools/handlers/list_dir.rs @@ -106,6 +106,7 @@ impl ToolHandler for ListDirHandler { output.extend(entries); Ok(ToolOutput::Function { content: output.join("\n"), + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/mcp.rs b/codex-rs/core/src/tools/handlers/mcp.rs index 4b2bf3b80e..9798fb8241 100644 --- a/codex-rs/core/src/tools/handlers/mcp.rs +++ b/codex-rs/core/src/tools/handlers/mcp.rs @@ -56,8 +56,16 @@ impl ToolHandler for McpHandler { Ok(ToolOutput::Mcp { result }) } codex_protocol::models::ResponseInputItem::FunctionCallOutput { output, .. } => { - let codex_protocol::models::FunctionCallOutputPayload { content, success } = output; - Ok(ToolOutput::Function { content, success }) + let codex_protocol::models::FunctionCallOutputPayload { + content, + content_items, + success, + } = output; + Ok(ToolOutput::Function { + content, + content_items, + success, + }) } _ => Err(FunctionCallError::RespondToModel( "mcp handler received unexpected response variant".to_string(), diff --git a/codex-rs/core/src/tools/handlers/mcp_resource.rs b/codex-rs/core/src/tools/handlers/mcp_resource.rs index be496f01ec..b601591ac1 100644 --- a/codex-rs/core/src/tools/handlers/mcp_resource.rs +++ b/codex-rs/core/src/tools/handlers/mcp_resource.rs @@ -297,7 +297,10 @@ async fn handle_list_resources( match payload_result { Ok(payload) => match serialize_function_output(payload) { Ok(output) => { - let ToolOutput::Function { content, success } = &output else { + let ToolOutput::Function { + content, success, .. + } = &output + else { unreachable!("MCP resource handler should return function output"); }; let duration = start.elapsed(); @@ -403,7 +406,10 @@ async fn handle_list_resource_templates( match payload_result { Ok(payload) => match serialize_function_output(payload) { Ok(output) => { - let ToolOutput::Function { content, success } = &output else { + let ToolOutput::Function { + content, success, .. + } = &output + else { unreachable!("MCP resource handler should return function output"); }; let duration = start.elapsed(); @@ -489,7 +495,10 @@ async fn handle_read_resource( match payload_result { Ok(payload) => match serialize_function_output(payload) { Ok(output) => { - let ToolOutput::Function { content, success } = &output else { + let ToolOutput::Function { + content, success, .. + } = &output + else { unreachable!("MCP resource handler should return function output"); }; let duration = start.elapsed(); @@ -618,6 +627,7 @@ where Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/plan.rs b/codex-rs/core/src/tools/handlers/plan.rs index ba8de6bef7..073319bf1c 100644 --- a/codex-rs/core/src/tools/handlers/plan.rs +++ b/codex-rs/core/src/tools/handlers/plan.rs @@ -88,6 +88,7 @@ impl ToolHandler for PlanHandler { Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/read_file.rs b/codex-rs/core/src/tools/handlers/read_file.rs index f9b6ae4dab..58b6ea6888 100644 --- a/codex-rs/core/src/tools/handlers/read_file.rs +++ b/codex-rs/core/src/tools/handlers/read_file.rs @@ -149,6 +149,7 @@ impl ToolHandler for ReadFileHandler { }; Ok(ToolOutput::Function { content: collected.join("\n"), + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/shell.rs b/codex-rs/core/src/tools/handlers/shell.rs index cab313077f..76650992b9 100644 --- a/codex-rs/core/src/tools/handlers/shell.rs +++ b/codex-rs/core/src/tools/handlers/shell.rs @@ -136,6 +136,7 @@ impl ShellHandler { let content = item?; return Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }); } @@ -179,6 +180,7 @@ impl ShellHandler { let content = emitter.finish(event_ctx, out).await?; return Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }); } @@ -226,6 +228,7 @@ impl ShellHandler { let content = emitter.finish(event_ctx, out).await?; Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/test_sync.rs b/codex-rs/core/src/tools/handlers/test_sync.rs index e340ab47f7..d217c1e8a6 100644 --- a/codex-rs/core/src/tools/handlers/test_sync.rs +++ b/codex-rs/core/src/tools/handlers/test_sync.rs @@ -95,6 +95,7 @@ impl ToolHandler for TestSyncHandler { Ok(ToolOutput::Function { content: "ok".to_string(), + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/unified_exec.rs b/codex-rs/core/src/tools/handlers/unified_exec.rs index 7d1102121e..32ace6c959 100644 --- a/codex-rs/core/src/tools/handlers/unified_exec.rs +++ b/codex-rs/core/src/tools/handlers/unified_exec.rs @@ -171,6 +171,7 @@ impl ToolHandler for UnifiedExecHandler { Ok(ToolOutput::Function { content, + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/handlers/view_image.rs b/codex-rs/core/src/tools/handlers/view_image.rs index b25642d803..6b308c0944 100644 --- a/codex-rs/core/src/tools/handlers/view_image.rs +++ b/codex-rs/core/src/tools/handlers/view_image.rs @@ -85,6 +85,7 @@ impl ToolHandler for ViewImageHandler { Ok(ToolOutput::Function { content: "attached local image path".to_string(), + content_items: None, success: Some(true), }) } diff --git a/codex-rs/core/src/tools/parallel.rs b/codex-rs/core/src/tools/parallel.rs index 449b8e6553..7340f5cc9c 100644 --- a/codex-rs/core/src/tools/parallel.rs +++ b/codex-rs/core/src/tools/parallel.rs @@ -105,7 +105,7 @@ impl ToolCallRuntime { call_id: call.call_id.clone(), output: FunctionCallOutputPayload { content: "aborted".to_string(), - success: None, + ..Default::default() }, }, } diff --git a/codex-rs/core/src/tools/router.rs b/codex-rs/core/src/tools/router.rs index 161997fb6c..19098aa80d 100644 --- a/codex-rs/core/src/tools/router.rs +++ b/codex-rs/core/src/tools/router.rs @@ -181,6 +181,7 @@ impl ToolRouter { output: codex_protocol::models::FunctionCallOutputPayload { content: message, success: Some(false), + ..Default::default() }, } } diff --git a/codex-rs/core/tests/suite/rmcp_client.rs b/codex-rs/core/tests/suite/rmcp_client.rs index 99c863b8f9..85698e9216 100644 --- a/codex-rs/core/tests/suite/rmcp_client.rs +++ b/codex-rs/core/tests/suite/rmcp_client.rs @@ -14,6 +14,8 @@ use codex_core::features::Feature; use codex_core::protocol::AskForApproval; use codex_core::protocol::EventMsg; +use codex_core::protocol::McpInvocation; +use codex_core::protocol::McpToolCallBeginEvent; use codex_core::protocol::Op; use codex_core::protocol::SandboxPolicy; use codex_protocol::config_types::ReasoningSummary; @@ -25,7 +27,9 @@ use core_test_support::test_codex::test_codex; use core_test_support::wait_for_event; use core_test_support::wait_for_event_with_timeout; use escargot::CargoBuild; +use mcp_types::ContentBlock; use serde_json::Value; +use serde_json::json; use serial_test::serial; use tempfile::tempdir; use tokio::net::TcpStream; @@ -35,6 +39,8 @@ use tokio::time::Instant; use tokio::time::sleep; use wiremock::matchers::any; +static OPENAI_PNG: &str = ""; + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] #[serial(mcp_test_value)] async fn stdio_server_round_trip() -> anyhow::Result<()> { @@ -175,6 +181,352 @@ async fn stdio_server_round_trip() -> anyhow::Result<()> { Ok(()) } +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +#[serial(mcp_test_value)] +async fn stdio_image_responses_round_trip() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + + let call_id = "img-1"; + let server_name = "rmcp"; + let tool_name = format!("mcp__{server_name}__image"); + + // First stream: model decides to call the image tool. + mount_sse_once_match( + &server, + any(), + responses::sse(vec![ + responses::ev_response_created("resp-1"), + responses::ev_function_call(call_id, &tool_name, "{}"), + responses::ev_completed("resp-1"), + ]), + ) + .await; + // Second stream: after tool execution, assistant emits a message and completes. + let final_mock = mount_sse_once_match( + &server, + any(), + responses::sse(vec![ + responses::ev_assistant_message("msg-1", "rmcp image tool completed successfully."), + responses::ev_completed("resp-2"), + ]), + ) + .await; + + // Build the stdio rmcp server and pass the image as data URL so it can construct ImageContent. + let rmcp_test_server_bin = CargoBuild::new() + .package("codex-rmcp-client") + .bin("test_stdio_server") + .run()? + .path() + .to_string_lossy() + .into_owned(); + + let fixture = test_codex() + .with_config(move |config| { + config.features.enable(Feature::RmcpClient); + config.mcp_servers.insert( + server_name.to_string(), + McpServerConfig { + transport: McpServerTransportConfig::Stdio { + command: rmcp_test_server_bin, + args: Vec::new(), + env: Some(HashMap::from([( + "MCP_TEST_IMAGE_DATA_URL".to_string(), + OPENAI_PNG.to_string(), + )])), + env_vars: Vec::new(), + cwd: None, + }, + enabled: true, + startup_timeout_sec: Some(Duration::from_secs(10)), + tool_timeout_sec: None, + enabled_tools: None, + disabled_tools: None, + }, + ); + }) + .build(&server) + .await?; + let session_model = fixture.session_configured.model.clone(); + + fixture + .codex + .submit(Op::UserTurn { + items: vec![UserInput::Text { + text: "call the rmcp image tool".into(), + }], + final_output_json_schema: None, + cwd: fixture.cwd.path().to_path_buf(), + approval_policy: AskForApproval::Never, + sandbox_policy: SandboxPolicy::ReadOnly, + model: session_model, + effort: None, + summary: ReasoningSummary::Auto, + }) + .await?; + + // Wait for tool begin/end and final completion. + let begin_event = wait_for_event_with_timeout( + &fixture.codex, + |ev| matches!(ev, EventMsg::McpToolCallBegin(_)), + Duration::from_secs(10), + ) + .await; + let EventMsg::McpToolCallBegin(begin) = begin_event else { + unreachable!("begin"); + }; + assert_eq!( + begin, + McpToolCallBeginEvent { + call_id: call_id.to_string(), + invocation: McpInvocation { + server: server_name.to_string(), + tool: "image".to_string(), + arguments: Some(json!({})), + }, + }, + ); + + let end_event = wait_for_event(&fixture.codex, |ev| { + matches!(ev, EventMsg::McpToolCallEnd(_)) + }) + .await; + let EventMsg::McpToolCallEnd(end) = end_event else { + unreachable!("end"); + }; + assert_eq!(end.call_id, call_id); + assert_eq!( + end.invocation, + McpInvocation { + server: server_name.to_string(), + tool: "image".to_string(), + arguments: Some(json!({})), + } + ); + let result = end.result.expect("rmcp image tool should return success"); + assert_eq!(result.is_error, Some(false)); + assert_eq!(result.content.len(), 1); + let base64_only = OPENAI_PNG + .strip_prefix("data:image/png;base64,") + .expect("data url prefix"); + match &result.content[0] { + ContentBlock::ImageContent(img) => { + assert_eq!(img.mime_type, "image/png"); + assert_eq!(img.r#type, "image"); + assert_eq!(img.data, base64_only); + } + other => panic!("expected image content, got {other:?}"), + } + + wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await; + + let output_item = final_mock.single_request().function_call_output(call_id); + assert_eq!( + output_item, + json!({ + "type": "function_call_output", + "call_id": call_id, + "output": [{ + "type": "input_image", + "image_url": OPENAI_PNG + }] + }) + ); + server.verify().await; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 1)] +#[serial(mcp_test_value)] +async fn stdio_image_completions_round_trip() -> anyhow::Result<()> { + skip_if_no_network!(Ok(())); + + let server = responses::start_mock_server().await; + + let call_id = "img-cc-1"; + let server_name = "rmcp"; + let tool_name = format!("mcp__{server_name}__image"); + + let tool_call = json!({ + "choices": [ + { + "delta": { + "tool_calls": [ + { + "id": call_id, + "type": "function", + "function": {"name": tool_name, "arguments": "{}"} + } + ] + }, + "finish_reason": "tool_calls" + } + ] + }); + let sse_tool_call = format!( + "data: {}\n\ndata: [DONE]\n\n", + serde_json::to_string(&tool_call)? + ); + + let final_assistant = json!({ + "choices": [ + { + "delta": {"content": "rmcp image tool completed successfully."}, + "finish_reason": "stop" + } + ] + }); + let sse_final = format!( + "data: {}\n\ndata: [DONE]\n\n", + serde_json::to_string(&final_assistant)? + ); + + use std::sync::atomic::AtomicUsize; + use std::sync::atomic::Ordering; + struct ChatSeqResponder { + num_calls: AtomicUsize, + bodies: Vec, + } + impl wiremock::Respond for ChatSeqResponder { + fn respond(&self, _: &wiremock::Request) -> wiremock::ResponseTemplate { + let idx = self.num_calls.fetch_add(1, Ordering::SeqCst); + match self.bodies.get(idx) { + Some(body) => wiremock::ResponseTemplate::new(200) + .insert_header("content-type", "text/event-stream") + .set_body_string(body.clone()), + None => panic!("no chat completion response for index {idx}"), + } + } + } + + let chat_seq = ChatSeqResponder { + num_calls: AtomicUsize::new(0), + bodies: vec![sse_tool_call, sse_final], + }; + wiremock::Mock::given(wiremock::matchers::method("POST")) + .and(wiremock::matchers::path("/v1/chat/completions")) + .respond_with(chat_seq) + .expect(2) + .mount(&server) + .await; + + let rmcp_test_server_bin = CargoBuild::new() + .package("codex-rmcp-client") + .bin("test_stdio_server") + .run()? + .path() + .to_string_lossy() + .into_owned(); + + let fixture = test_codex() + .with_config(move |config| { + config.model_provider.wire_api = codex_core::WireApi::Chat; + config.features.enable(Feature::RmcpClient); + config.mcp_servers.insert( + server_name.to_string(), + McpServerConfig { + transport: McpServerTransportConfig::Stdio { + command: rmcp_test_server_bin, + args: Vec::new(), + env: Some(HashMap::from([( + "MCP_TEST_IMAGE_DATA_URL".to_string(), + OPENAI_PNG.to_string(), + )])), + env_vars: Vec::new(), + cwd: None, + }, + enabled: true, + startup_timeout_sec: Some(Duration::from_secs(10)), + tool_timeout_sec: None, + enabled_tools: None, + disabled_tools: None, + }, + ); + }) + .build(&server) + .await?; + let session_model = fixture.session_configured.model.clone(); + + fixture + .codex + .submit(Op::UserTurn { + items: vec![UserInput::Text { + text: "call the rmcp image tool".into(), + }], + final_output_json_schema: None, + cwd: fixture.cwd.path().to_path_buf(), + approval_policy: AskForApproval::Never, + sandbox_policy: SandboxPolicy::ReadOnly, + model: session_model, + effort: None, + summary: ReasoningSummary::Auto, + }) + .await?; + + let begin_event = wait_for_event_with_timeout( + &fixture.codex, + |ev| matches!(ev, EventMsg::McpToolCallBegin(_)), + Duration::from_secs(10), + ) + .await; + let EventMsg::McpToolCallBegin(begin) = begin_event else { + unreachable!("begin"); + }; + assert_eq!( + begin, + McpToolCallBeginEvent { + call_id: call_id.to_string(), + invocation: McpInvocation { + server: server_name.to_string(), + tool: "image".to_string(), + arguments: Some(json!({})), + }, + }, + ); + + let end_event = wait_for_event(&fixture.codex, |ev| { + matches!(ev, EventMsg::McpToolCallEnd(_)) + }) + .await; + let EventMsg::McpToolCallEnd(end) = end_event else { + unreachable!("end"); + }; + assert!(end.result.as_ref().is_ok(), "tool call should succeed"); + + wait_for_event(&fixture.codex, |ev| matches!(ev, EventMsg::TaskComplete(_))).await; + + // Chat Completions assertion: the second POST should include a tool role message + // with an array `content` containing an item with the expected data URL. + let requests = server.received_requests().await.expect("requests captured"); + assert!(requests.len() >= 2, "expected two chat completion calls"); + let second = &requests[1]; + let body: Value = serde_json::from_slice(&second.body)?; + let messages = body + .get("messages") + .and_then(Value::as_array) + .cloned() + .expect("messages array"); + let tool_msg = messages + .iter() + .find(|m| { + m.get("role") == Some(&json!("tool")) && m.get("tool_call_id") == Some(&json!(call_id)) + }) + .cloned() + .expect("tool message present"); + assert_eq!( + tool_msg, + json!({ + "role": "tool", + "tool_call_id": call_id, + "content": [{"type": "image_url", "image_url": {"url": OPENAI_PNG}}] + }) + ); + + Ok(()) +} + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] #[serial(mcp_test_value)] async fn stdio_server_propagates_whitelisted_env_vars() -> anyhow::Result<()> { diff --git a/codex-rs/protocol/src/models.rs b/codex-rs/protocol/src/models.rs index 4430a0998a..614a5ff2b0 100644 --- a/codex-rs/protocol/src/models.rs +++ b/codex-rs/protocol/src/models.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use base64::Engine; use codex_utils_image::load_and_resize_to_fit; use mcp_types::CallToolResult; +use mcp_types::ContentBlock; use serde::Deserialize; use serde::Deserializer; use serde::Serialize; @@ -83,9 +84,8 @@ pub enum ResponseItem { // NOTE: The input schema for `function_call_output` objects that clients send to the // OpenAI /v1/responses endpoint is NOT the same shape as the objects the server returns on the // SSE stream. When *sending* we must wrap the string output inside an object that includes a - // required `success` boolean. The upstream TypeScript CLI does this implicitly. To ensure we - // serialize exactly the expected shape we introduce a dedicated payload struct and flatten it - // here. + // required `success` boolean. To ensure we serialize exactly the expected shape we introduce + // a dedicated payload struct and flatten it here. FunctionCallOutput { call_id: String, output: FunctionCallOutputPayload, @@ -160,19 +160,17 @@ impl From for ResponseItem { ResponseInputItem::FunctionCallOutput { call_id, output } => { Self::FunctionCallOutput { call_id, output } } - ResponseInputItem::McpToolCallOutput { call_id, result } => Self::FunctionCallOutput { - call_id, - output: FunctionCallOutputPayload { - success: Some(result.is_ok()), - content: result.map_or_else( - |tool_call_err| format!("err: {tool_call_err:?}"), - |result| { - serde_json::to_string(&result) - .unwrap_or_else(|e| format!("JSON serialization error: {e}")) - }, - ), - }, - }, + ResponseInputItem::McpToolCallOutput { call_id, result } => { + let output = match result { + Ok(result) => FunctionCallOutputPayload::from(&result), + Err(tool_call_err) => FunctionCallOutputPayload { + content: format!("err: {tool_call_err:?}"), + success: Some(false), + ..Default::default() + }, + }; + Self::FunctionCallOutput { call_id, output } + } ResponseInputItem::CustomToolCallOutput { call_id, output } => { Self::CustomToolCallOutput { call_id, output } } @@ -290,31 +288,53 @@ pub struct ShellToolCallParams { pub justification: Option, } -#[derive(Debug, Clone, PartialEq, JsonSchema, TS)] +/// Responses API compatible content items that can be returned by a tool call. +/// This is a subset of ContentItem with the types we support as function call outputs. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, JsonSchema, TS)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum FunctionCallOutputContentItem { + // Do not rename, these are serialized and used directly in the responses API. + InputText { text: String }, + // Do not rename, these are serialized and used directly in the responses API. + InputImage { image_url: String }, +} + +/// The payload we send back to OpenAI when reporting a tool call result. +/// +/// `content` preserves the historical plain-string payload so downstream +/// integrations (tests, logging, etc.) can keep treating tool output as +/// `String`. When an MCP server returns richer data we additionally populate +/// `content_items` with the structured form that the Responses/Chat +/// Completions APIs understand. +#[derive(Debug, Default, Clone, PartialEq, JsonSchema, TS)] pub struct FunctionCallOutputPayload { pub content: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub content_items: Option>, // TODO(jif) drop this. pub success: Option, } +#[derive(Deserialize)] +#[serde(untagged)] +enum FunctionCallOutputPayloadSerde { + Text(String), + Items(Vec), +} + // The Responses API expects two *different* shapes depending on success vs failure: // • success → output is a plain string (no nested object) // • failure → output is an object { content, success:false } -// The upstream TypeScript CLI implements this by special‑casing the serialize path. -// We replicate that behavior with a manual Serialize impl. - impl Serialize for FunctionCallOutputPayload { fn serialize(&self, serializer: S) -> Result where S: Serializer, { - // The upstream TypeScript CLI always serializes `output` as a *plain string* regardless - // of whether the function call succeeded or failed. The boolean is purely informational - // for local bookkeeping and is NOT sent to the OpenAI endpoint. Sending the nested object - // form `{ content, success:false }` triggers the 400 we are still seeing. Mirror the JS CLI - // exactly: always emit a bare string. - - serializer.serialize_str(&self.content) + if let Some(items) = &self.content_items { + items.serialize(serializer) + } else { + serializer.serialize_str(&self.content) + } } } @@ -323,14 +343,106 @@ impl<'de> Deserialize<'de> for FunctionCallOutputPayload { where D: Deserializer<'de>, { - let s = String::deserialize(deserializer)?; - Ok(FunctionCallOutputPayload { - content: s, - success: None, - }) + match FunctionCallOutputPayloadSerde::deserialize(deserializer)? { + FunctionCallOutputPayloadSerde::Text(content) => Ok(FunctionCallOutputPayload { + content, + ..Default::default() + }), + FunctionCallOutputPayloadSerde::Items(items) => { + let content = serde_json::to_string(&items).map_err(serde::de::Error::custom)?; + Ok(FunctionCallOutputPayload { + content, + content_items: Some(items), + success: None, + }) + } + } + } +} + +impl From<&CallToolResult> for FunctionCallOutputPayload { + fn from(call_tool_result: &CallToolResult) -> Self { + let CallToolResult { + content, + structured_content, + is_error, + } = call_tool_result; + + let is_success = is_error != &Some(true); + + if let Some(structured_content) = structured_content + && !structured_content.is_null() + { + match serde_json::to_string(structured_content) { + Ok(serialized_structured_content) => { + return FunctionCallOutputPayload { + content: serialized_structured_content, + success: Some(is_success), + ..Default::default() + }; + } + Err(err) => { + return FunctionCallOutputPayload { + content: err.to_string(), + success: Some(false), + ..Default::default() + }; + } + } + } + + let serialized_content = match serde_json::to_string(content) { + Ok(serialized_content) => serialized_content, + Err(err) => { + return FunctionCallOutputPayload { + content: err.to_string(), + success: Some(false), + ..Default::default() + }; + } + }; + + let content_items = convert_content_blocks_to_items(content); + + FunctionCallOutputPayload { + content: serialized_content, + content_items, + success: Some(is_success), + } } } +fn convert_content_blocks_to_items( + blocks: &[ContentBlock], +) -> Option> { + let mut saw_image = false; + let mut items = Vec::with_capacity(blocks.len()); + + for block in blocks { + match block { + ContentBlock::TextContent(text) => { + items.push(FunctionCallOutputContentItem::InputText { + text: text.text.clone(), + }); + } + ContentBlock::ImageContent(image) => { + saw_image = true; + // Just in case the content doesn't include a data URL, add it. + let image_url = if image.data.starts_with("data:") { + image.data.clone() + } else { + format!("data:{};base64,{}", image.mime_type, image.data) + }; + items.push(FunctionCallOutputContentItem::InputImage { image_url }); + } + // TODO: render audio, resource, and embedded resource content to the model. + _ => return None, + } + } + + if saw_image { Some(items) } else { None } +} + // Implement Display so callers can treat the payload like a plain string when logging or doing // trivial substring checks in tests (existing tests call `.contains()` on the output). Display // returns the raw `content` field. @@ -354,6 +466,8 @@ impl std::ops::Deref for FunctionCallOutputPayload { mod tests { use super::*; use anyhow::Result; + use mcp_types::ImageContent; + use mcp_types::TextContent; use tempfile::tempdir; #[test] @@ -362,7 +476,7 @@ mod tests { call_id: "call1".into(), output: FunctionCallOutputPayload { content: "ok".into(), - success: None, + ..Default::default() }, }; @@ -381,6 +495,7 @@ mod tests { output: FunctionCallOutputPayload { content: "bad".into(), success: Some(false), + ..Default::default() }, }; @@ -391,6 +506,81 @@ mod tests { Ok(()) } + #[test] + fn serializes_image_outputs_as_array() -> Result<()> { + let call_tool_result = CallToolResult { + content: vec![ + ContentBlock::TextContent(TextContent { + annotations: None, + text: "caption".into(), + r#type: "text".into(), + }), + ContentBlock::ImageContent(ImageContent { + annotations: None, + data: "BASE64".into(), + mime_type: "image/png".into(), + r#type: "image".into(), + }), + ], + is_error: None, + structured_content: None, + }; + + let payload = FunctionCallOutputPayload::from(&call_tool_result); + assert_eq!(payload.success, Some(true)); + let items = payload.content_items.clone().expect("content items"); + assert_eq!( + items, + vec![ + FunctionCallOutputContentItem::InputText { + text: "caption".into(), + }, + FunctionCallOutputContentItem::InputImage { + image_url: "".into(), + }, + ] + ); + + let item = ResponseInputItem::FunctionCallOutput { + call_id: "call1".into(), + output: payload, + }; + + let json = serde_json::to_string(&item)?; + let v: serde_json::Value = serde_json::from_str(&json)?; + + let output = v.get("output").expect("output field"); + assert!(output.is_array(), "expected array output"); + + Ok(()) + } + + #[test] + fn deserializes_array_payload_into_items() -> Result<()> { + let json = r#"[ + {"type": "input_text", "text": "note"}, + {"type": "input_image", "image_url": ""} + ]"#; + + let payload: FunctionCallOutputPayload = serde_json::from_str(json)?; + + assert_eq!(payload.success, None); + let expected_items = vec![ + FunctionCallOutputContentItem::InputText { + text: "note".into(), + }, + FunctionCallOutputContentItem::InputImage { + image_url: "".into(), + }, + ]; + assert_eq!(payload.content_items, Some(expected_items.clone())); + + let expected_content = serde_json::to_string(&expected_items)?; + assert_eq!(payload.content, expected_content); + + Ok(()) + } + #[test] fn deserialize_shell_tool_call_params() -> Result<()> { let json = r#"{ diff --git a/codex-rs/protocol/src/protocol.rs b/codex-rs/protocol/src/protocol.rs index bb14f3797b..a7f5241bb3 100644 --- a/codex-rs/protocol/src/protocol.rs +++ b/codex-rs/protocol/src/protocol.rs @@ -814,7 +814,7 @@ pub struct AgentReasoningDeltaEvent { pub delta: String, } -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)] +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS, PartialEq)] pub struct McpInvocation { /// Name of the MCP server as defined in the config. pub server: String, @@ -824,14 +824,14 @@ pub struct McpInvocation { pub arguments: Option, } -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)] +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS, PartialEq)] pub struct McpToolCallBeginEvent { /// Identifier so this can be paired with the McpToolCallEnd event. pub call_id: String, pub invocation: McpInvocation, } -#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS)] +#[derive(Debug, Clone, Deserialize, Serialize, JsonSchema, TS, PartialEq)] pub struct McpToolCallEndEvent { /// Identifier for the corresponding McpToolCallBegin that finished. pub call_id: String, diff --git a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs index 44ae50f02f..aafba59324 100644 --- a/codex-rs/rmcp-client/src/bin/test_stdio_server.rs +++ b/codex-rs/rmcp-client/src/bin/test_stdio_server.rs @@ -40,7 +40,7 @@ pub fn stdio() -> (tokio::io::Stdin, tokio::io::Stdout) { } impl TestToolServer { fn new() -> Self { - let tools = vec![Self::echo_tool()]; + let tools = vec![Self::echo_tool(), Self::image_tool()]; let resources = vec![Self::memo_resource()]; let resource_templates = vec![Self::memo_template()]; Self { @@ -70,6 +70,22 @@ impl TestToolServer { ) } + fn image_tool() -> Tool { + #[expect(clippy::expect_used)] + let schema: JsonObject = serde_json::from_value(serde_json::json!({ + "type": "object", + "properties": {}, + "additionalProperties": false + })) + .expect("image tool schema should deserialize"); + + Tool::new( + Cow::Borrowed("image"), + Cow::Borrowed("Return a single image content block."), + Arc::new(schema), + ) + } + fn memo_resource() -> Resource { let raw = RawResource { uri: MEMO_URI.to_string(), @@ -214,6 +230,35 @@ impl ServerHandler for TestToolServer { meta: None, }) } + "image" => { + // Read a data URL (e.g. ...) from env and convert to + // an MCP image content block. Tests set MCP_TEST_IMAGE_DATA_URL. + let data_url = std::env::var("MCP_TEST_IMAGE_DATA_URL").map_err(|_| { + McpError::invalid_params( + "missing MCP_TEST_IMAGE_DATA_URL env var for image tool", + None, + ) + })?; + + fn parse_data_url(url: &str) -> Option<(String, String)> { + let rest = url.strip_prefix("data:")?; + let (mime_and_opts, data) = rest.split_once(',')?; + let (mime, _opts) = + mime_and_opts.split_once(';').unwrap_or((mime_and_opts, "")); + Some((mime.to_string(), data.to_string())) + } + + let (mime_type, data_b64) = parse_data_url(&data_url).ok_or_else(|| { + McpError::invalid_params( + format!("invalid data URL for image tool: {data_url}"), + None, + ) + })?; + + Ok(CallToolResult::success(vec![rmcp::model::Content::image( + data_b64, mime_type, + )])) + } other => Err(McpError::invalid_params( format!("unknown tool: {other}"), None, diff --git a/codex-rs/tui/src/history_cell.rs b/codex-rs/tui/src/history_cell.rs index e630e00175..406a892561 100644 --- a/codex-rs/tui/src/history_cell.rs +++ b/codex-rs/tui/src/history_cell.rs @@ -465,7 +465,7 @@ struct CompletedMcpToolCallWithImageOutput { } impl HistoryCell for CompletedMcpToolCallWithImageOutput { fn display_lines(&self, _width: u16) -> Vec> { - vec!["tool result (image output omitted)".into()] + vec!["tool result (image output)".into()] } }