113 changes: 111 additions & 2 deletions async-openai/src/types/realtime/client_event.rs
@@ -1,7 +1,76 @@
use serde::{Deserialize, Serialize};
use tokio_tungstenite::tungstenite::Message;

use super::{item::Item, session_resource::SessionResource};
use super::{
item::Item,
session_resource::{
AudioFormat, MaxResponseOutputTokens, Modality, RealtimeVoice, SessionResource, ToolChoice,
ToolDefinition,
},
};

/// Configuration for a response in the OpenAI Realtime API.
/// This is used in the `response.create` event.
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseConfig {
/// Controls which conversation the response is added to. Currently supports "auto" and "none",
/// with "auto" as the default value. The "auto" value means that the contents of the response
/// will be added to the default conversation. Set this to "none" to create an out-of-band response
/// that will not add items to the default conversation.
#[serde(skip_serializing_if = "Option::is_none")]
pub conversation: Option<String>,

/// Input items to include in the prompt for the model. Using this field creates a new context
/// for this Response instead of using the default conversation. An empty array [] will clear
/// the context for this Response. Note that this can include references to items from the default conversation.
#[serde(skip_serializing_if = "Option::is_none")]
pub input: Option<Vec<Item>>,

/// The default system instructions (i.e. system message) prepended to model calls.
/// This field allows the client to guide the model on desired responses.
#[serde(skip_serializing_if = "Option::is_none")]
pub instructions: Option<String>,

/// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
/// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
/// Defaults to "inf".
#[serde(skip_serializing_if = "Option::is_none")]
pub max_response_output_tokens: Option<MaxResponseOutputTokens>,

/// Set of 16 key-value pairs that can be attached to an object.
/// This can be useful for storing additional information about the object in a structured format.
#[serde(skip_serializing_if = "Option::is_none")]
pub metadata: Option<std::collections::HashMap<String, String>>,

/// The set of modalities the model can respond with. To disable audio, set this to ["text"].
#[serde(skip_serializing_if = "Option::is_none")]
pub modalities: Option<Vec<Modality>>,

/// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
#[serde(skip_serializing_if = "Option::is_none")]
pub output_audio_format: Option<AudioFormat>,

/// Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
#[serde(skip_serializing_if = "Option::is_none")]
pub temperature: Option<f32>,

/// How the model chooses tools.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_choice: Option<ToolChoice>,

/// Tools (functions) available to the model.
#[serde(skip_serializing_if = "Option::is_none")]
pub tools: Option<Vec<ToolDefinition>>,

/// The voice the model uses to respond. Cannot be changed once the model has responded with audio at least once.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<RealtimeVoice>,

/// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
/// This value can only be changed in between model turns, not while a response is in progress.
#[serde(skip_serializing_if = "Option::is_none")]
pub speed: Option<f32>,
}
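
Review note (not part of the diff): a minimal usage sketch for the new `ResponseConfig`. It only sets fields whose types appear in this hunk, leaves everything else to `Default`, and assumes `event_from!` expands to the usual `From<ResponseCreateEvent> for ClientEvent` impl (see the macro invocations later in this file). IDs and instruction text are placeholders.

// Sketch only: builds an out-of-band request that does not touch the default conversation.
// Assumes the items defined in this file are in scope.
fn example_out_of_band_response() -> ClientEvent {
    let config = ResponseConfig {
        // "none" keeps the response out of the default conversation.
        conversation: Some("none".to_string()),
        instructions: Some("Summarize the conversation so far.".to_string()),
        temperature: Some(0.7),
        // Remaining options stay unset and are skipped during serialization.
        ..Default::default()
    };

    ResponseCreateEvent {
        event_id: Some("evt_review_example".to_string()),
        response: Some(config),
    }
    .into()
}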

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct SessionUpdateEvent {
@@ -35,6 +104,13 @@ pub struct InputAudioBufferClearEvent {
pub event_id: Option<String>,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct OutputAudioBufferClearEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ConversationItemCreateEvent {
/// Optional client-generated ID used to identify this event.
@@ -75,21 +151,34 @@ pub struct ConversationItemDeleteEvent {
pub item_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ConversationItemRetrieveEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,

/// The ID of the item to retrieve.
pub item_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseCreateEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,

/// Configuration for the response.
pub response: Option<SessionResource>,
pub response: Option<ResponseConfig>,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseCancelEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,

/// A specific response ID to cancel. If not provided, the server cancels the in-progress response in the default conversation.
#[serde(skip_serializing_if = "Option::is_none")]
pub response_id: Option<String>,
}

/// These are events that the OpenAI Realtime WebSocket server will accept from the client.
@@ -112,6 +201,10 @@ pub enum ClientEvent {
#[serde(rename = "input_audio_buffer.clear")]
InputAudioBufferClear(InputAudioBufferClearEvent),

/// WebRTC only: send this event to cut off the current audio response.
#[serde(rename = "output_audio_buffer.clear")]
OutputAudioBufferClear(OutputAudioBufferClearEvent),

/// Send this event when adding an item to the conversation.
#[serde(rename = "conversation.item.create")]
ConversationItemCreate(ConversationItemCreateEvent),
@@ -124,6 +217,10 @@ pub enum ClientEvent {
#[serde(rename = "conversation.item.delete")]
ConversationItemDelete(ConversationItemDeleteEvent),

/// Send this event when you want to retrieve the server's representation of a specific item in the conversation history.
#[serde(rename = "conversation.item.retrieve")]
ConversationItemRetrieve(ConversationItemRetrieveEvent),

/// Send this event to trigger a response generation.
#[serde(rename = "response.create")]
ResponseCreate(ResponseCreateEvent),
@@ -181,6 +278,11 @@ event_from!(
ClientEvent,
InputAudioBufferClear
);
event_from!(
OutputAudioBufferClearEvent,
ClientEvent,
OutputAudioBufferClear
);
event_from!(
ConversationItemCreateEvent,
ClientEvent,
@@ -198,14 +300,21 @@ event_from!(
);
event_from!(ResponseCreateEvent, ClientEvent, ResponseCreate);
event_from!(ResponseCancelEvent, ClientEvent, ResponseCancel);
event_from!(
ConversationItemRetrieveEvent,
ClientEvent,
ConversationItemRetrieve
);

message_from_event!(SessionUpdateEvent, ClientEvent);
message_from_event!(InputAudioBufferAppendEvent, ClientEvent);
message_from_event!(InputAudioBufferCommitEvent, ClientEvent);
message_from_event!(InputAudioBufferClearEvent, ClientEvent);
message_from_event!(OutputAudioBufferClearEvent, ClientEvent);
message_from_event!(ConversationItemCreateEvent, ClientEvent);
message_from_event!(ConversationItemTruncateEvent, ClientEvent);
message_from_event!(ConversationItemDeleteEvent, ClientEvent);
message_from_event!(ConversationItemRetrieveEvent, ClientEvent);
message_from_event!(ResponseCreateEvent, ClientEvent);
message_from_event!(ResponseCancelEvent, ClientEvent);

1 change: 1 addition & 0 deletions async-openai/src/types/realtime/item.rs
@@ -29,6 +29,7 @@ pub enum ItemRole {
pub enum ItemContentType {
InputText,
InputAudio,
InputImage,
Text,
Audio,
}
21 changes: 20 additions & 1 deletion async-openai/src/types/realtime/response_resource.rs
@@ -21,7 +21,9 @@ pub enum ResponseStatus {

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct FailedError {
pub code: String,
#[serde(rename = "type")]
pub error_type: Option<String>,
pub code: Option<String>,
pub message: String,
}

@@ -31,6 +33,17 @@ pub enum IncompleteReason {
Interruption,
MaxOutputTokens,
ContentFilter,
TokenLimit,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum FinishReason {
Stop,
Length,
ToolCalls,
ContentFilter,
FunctionCall,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -58,4 +71,10 @@ pub struct ResponseResource {
pub output: Vec<Item>,
/// Usage statistics for the response.
pub usage: Option<Usage>,
/// The Unix timestamp (in seconds) for when the response was created.
#[serde(skip_serializing_if = "Option::is_none")]
pub created_at: Option<u64>,
/// The reason the model stopped generating tokens, if applicable.
#[serde(skip_serializing_if = "Option::is_none")]
pub finish_reason: Option<FinishReason>,
}
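
Review note (not part of the diff): a small test sketch confirming that the looser `FailedError` shape accepts both a payload with the renamed "type" field and a minimal one carrying only a message. The JSON bodies are illustrative rather than taken from the API reference, and the sketch assumes `serde_json` is available as a dev dependency.

#[cfg(test)]
mod failed_error_shape {
    use super::FailedError;

    #[test]
    fn deserializes_with_and_without_optional_fields() {
        // Payload carrying the renamed "type" field plus a code.
        let full: FailedError = serde_json::from_str(
            r#"{"type":"invalid_request_error","code":"item_not_found","message":"Item does not exist."}"#,
        )
        .unwrap();
        assert_eq!(full.error_type.as_deref(), Some("invalid_request_error"));

        // Minimal payload: only a message. Both Option fields default to None.
        let minimal: FailedError =
            serde_json::from_str(r#"{"message":"Something went wrong."}"#).unwrap();
        assert!(minimal.error_type.is_none() && minimal.code.is_none());
    }
}
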
28 changes: 25 additions & 3 deletions async-openai/src/types/realtime/server_event.rs
@@ -38,11 +38,11 @@ pub struct ConversationCreatedEvent {
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputAudioBufferCommitedEvent {
pub struct InputAudioBufferCommittedEvent {
/// The unique ID of the server event.
pub event_id: String,
/// The ID of the preceding item after which the new item will be inserted.
pub previous_item_id: String,
pub previous_item_id: Option<String>,
/// The ID of the user message item that will be created.
pub item_id: String,
}
@@ -53,6 +53,12 @@ pub struct InputAudioBufferClearedEvent {
pub event_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OutputAudioBufferClearedEvent {
/// The unique ID of the server event.
pub event_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputAudioBufferSpeechStartedEvent {
/// The unique ID of the server event.
@@ -154,6 +160,14 @@ pub struct ConversationItemDeletedEvent {
pub item_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ConversationItemRetrievedEvent {
/// The unique ID of the server event.
pub event_id: String,
/// The item that was retrieved.
pub item: Item,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseCreatedEvent {
/// The unique ID of the server event.
@@ -381,12 +395,16 @@ pub enum ServerEvent {

/// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode.
#[serde(rename = "input_audio_buffer.committed")]
InputAudioBufferCommited(InputAudioBufferCommitedEvent),
InputAudioBufferCommitted(InputAudioBufferCommittedEvent),

/// Returned when the input audio buffer is cleared by the client.
#[serde(rename = "input_audio_buffer.cleared")]
InputAudioBufferCleared(InputAudioBufferClearedEvent),

/// Returned when the output audio buffer is cleared by the client (WebRTC specific).
#[serde(rename = "output_audio_buffer.cleared")]
OutputAudioBufferCleared(OutputAudioBufferClearedEvent),

/// Returned in server turn detection mode when speech is detected.
#[serde(rename = "input_audio_buffer.speech_started")]
InputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent),
@@ -422,6 +440,10 @@ pub enum ServerEvent {
#[serde(rename = "conversation.item.deleted")]
ConversationItemDeleted(ConversationItemDeletedEvent),

/// Returned when an item in the conversation is retrieved.
#[serde(rename = "conversation.item.retrieved")]
ConversationItemRetrieved(ConversationItemRetrievedEvent),

/// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress".
#[serde(rename = "response.created")]
ResponseCreated(ResponseCreatedEvent),
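
Review note (not part of the diff): the new server-side variants slot into an ordinary match on `ServerEvent`; the handler below only touches fields shown in this hunk.

// Sketch only: handles the two variants added by this change and ignores the rest.
fn handle_new_server_events(event: &ServerEvent) {
    match event {
        ServerEvent::OutputAudioBufferCleared(e) => {
            // Playback buffer was flushed after a client-side clear (WebRTC).
            println!("output audio buffer cleared (event {})", e.event_id);
        }
        ServerEvent::ConversationItemRetrieved(e) => {
            // Server's canonical copy of the requested item arrived.
            println!("retrieved conversation item (event {})", e.event_id);
        }
        _ => {}
    }
}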