113 changes: 111 additions & 2 deletions async-openai/src/types/realtime/client_event.rs
@@ -1,7 +1,76 @@
use serde::{Deserialize, Serialize};
use tokio_tungstenite::tungstenite::Message;

use super::{item::Item, session_resource::SessionResource};
use super::{
item::Item,
session_resource::{
AudioFormat, MaxResponseOutputTokens, Modality, RealtimeVoice, SessionResource, ToolChoice,
ToolDefinition,
},
};

/// Configuration for a response in the OpenAI Realtime API.
/// This is used in the `response.create` event.
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseConfig {
/// Controls which conversation the response is added to. Currently supports "auto" and "none",
/// with "auto" as the default value. The "auto" value means that the contents of the response
/// will be added to the default conversation. Set this to "none" to create an out-of-band response
/// that will not add items to the default conversation.
#[serde(skip_serializing_if = "Option::is_none")]
pub conversation: Option<String>,

/// Input items to include in the prompt for the model. Using this field creates a new context
/// for this Response instead of using the default conversation. An empty array [] will clear
/// the context for this Response. Note that this can include references to items from the default conversation.
#[serde(skip_serializing_if = "Option::is_none")]
pub input: Option<Vec<Item>>,

/// The default system instructions (i.e. system message) prepended to model calls.
/// This field allows the client to guide the model on desired responses.
#[serde(skip_serializing_if = "Option::is_none")]
pub instructions: Option<String>,

/// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
/// Provide an integer between 1 and 4096 to limit output tokens, or "inf" for the maximum available tokens for a given model.
/// Defaults to "inf".
#[serde(skip_serializing_if = "Option::is_none")]
pub max_response_output_tokens: Option<MaxResponseOutputTokens>,

/// Set of 16 key-value pairs that can be attached to an object.
/// This can be useful for storing additional information about the object in a structured format.
#[serde(skip_serializing_if = "Option::is_none")]
pub metadata: Option<std::collections::HashMap<String, String>>,

/// The set of modalities the model can respond with. To disable audio, set this to ["text"].
#[serde(skip_serializing_if = "Option::is_none")]
pub modalities: Option<Vec<Modality>>,

/// The format of output audio. Options are "pcm16", "g711_ulaw", or "g711_alaw".
#[serde(skip_serializing_if = "Option::is_none")]
pub output_audio_format: Option<AudioFormat>,

/// Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
#[serde(skip_serializing_if = "Option::is_none")]
pub temperature: Option<f32>,

/// How the model chooses tools.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_choice: Option<ToolChoice>,

/// Tools (functions) available to the model.
#[serde(skip_serializing_if = "Option::is_none")]
pub tools: Option<Vec<ToolDefinition>>,

/// The voice the model uses to respond. Cannot be changed once the model has responded with audio at least once.
#[serde(skip_serializing_if = "Option::is_none")]
pub voice: Option<RealtimeVoice>,

/// The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed.
/// This value can only be changed in between model turns, not while a response is in progress.
#[serde(skip_serializing_if = "Option::is_none")]
pub speed: Option<f32>,
}
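
Review note (not part of the diff): a minimal usage sketch for the new `ResponseConfig`. It only sets fields whose types appear in this hunk, leaves everything else to `Default`, and assumes `event_from!` expands to the usual `From<ResponseCreateEvent> for ClientEvent` impl (see the macro invocations later in this file). IDs and instruction text are placeholders.

// Sketch only: builds an out-of-band request that does not touch the default conversation.
// Assumes the items defined in this file are in scope.
fn example_out_of_band_response() -> ClientEvent {
    let config = ResponseConfig {
        // "none" keeps the response out of the default conversation.
        conversation: Some("none".to_string()),
        instructions: Some("Summarize the conversation so far.".to_string()),
        temperature: Some(0.7),
        // Remaining options stay unset and are skipped during serialization.
        ..Default::default()
    };

    ResponseCreateEvent {
        event_id: Some("evt_review_example".to_string()),
        response: Some(config),
    }
    .into()
}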

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct SessionUpdateEvent {
@@ -35,6 +104,13 @@ pub struct InputAudioBufferClearEvent {
pub event_id: Option<String>,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct OutputAudioBufferClearEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ConversationItemCreateEvent {
/// Optional client-generated ID used to identify this event.
@@ -75,21 +151,34 @@ pub struct ConversationItemDeleteEvent {
pub item_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ConversationItemRetrieveEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,

/// The ID of the item to retrieve.
pub item_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseCreateEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,

/// Configuration for the response.
pub response: Option<SessionResource>,
pub response: Option<ResponseConfig>,
}

#[derive(Debug, Serialize, Deserialize, Clone, Default)]
pub struct ResponseCancelEvent {
/// Optional client-generated ID used to identify this event.
#[serde(skip_serializing_if = "Option::is_none")]
pub event_id: Option<String>,

/// A specific response ID to cancel. If not provided, the server cancels the in-progress response in the default conversation.
#[serde(skip_serializing_if = "Option::is_none")]
pub response_id: Option<String>,
}

/// These are events that the OpenAI Realtime WebSocket server will accept from the client.
@@ -112,6 +201,10 @@ pub enum ClientEvent {
#[serde(rename = "input_audio_buffer.clear")]
InputAudioBufferClear(InputAudioBufferClearEvent),

/// WebRTC only: send this event to cut off the current audio response.
#[serde(rename = "output_audio_buffer.clear")]
OutputAudioBufferClear(OutputAudioBufferClearEvent),

/// Send this event when adding an item to the conversation.
#[serde(rename = "conversation.item.create")]
ConversationItemCreate(ConversationItemCreateEvent),
@@ -124,6 +217,10 @@ pub enum ClientEvent {
#[serde(rename = "conversation.item.delete")]
ConversationItemDelete(ConversationItemDeleteEvent),

/// Send this event when you want to retrieve the server's representation of a specific item in the conversation history.
#[serde(rename = "conversation.item.retrieve")]
ConversationItemRetrieve(ConversationItemRetrieveEvent),

/// Send this event to trigger a response generation.
#[serde(rename = "response.create")]
ResponseCreate(ResponseCreateEvent),
@@ -181,6 +278,11 @@ event_from!(
ClientEvent,
InputAudioBufferClear
);
event_from!(
OutputAudioBufferClearEvent,
ClientEvent,
OutputAudioBufferClear
);
event_from!(
ConversationItemCreateEvent,
ClientEvent,
@@ -198,14 +300,21 @@ event_from!(
);
event_from!(ResponseCreateEvent, ClientEvent, ResponseCreate);
event_from!(ResponseCancelEvent, ClientEvent, ResponseCancel);
event_from!(
ConversationItemRetrieveEvent,
ClientEvent,
ConversationItemRetrieve
);

message_from_event!(SessionUpdateEvent, ClientEvent);
message_from_event!(InputAudioBufferAppendEvent, ClientEvent);
message_from_event!(InputAudioBufferCommitEvent, ClientEvent);
message_from_event!(InputAudioBufferClearEvent, ClientEvent);
message_from_event!(OutputAudioBufferClearEvent, ClientEvent);
message_from_event!(ConversationItemCreateEvent, ClientEvent);
message_from_event!(ConversationItemTruncateEvent, ClientEvent);
message_from_event!(ConversationItemDeleteEvent, ClientEvent);
message_from_event!(ConversationItemRetrieveEvent, ClientEvent);
message_from_event!(ResponseCreateEvent, ClientEvent);
message_from_event!(ResponseCancelEvent, ClientEvent);

1 change: 1 addition & 0 deletions async-openai/src/types/realtime/item.rs
@@ -29,6 +29,7 @@ pub enum ItemRole {
pub enum ItemContentType {
InputText,
InputAudio,
InputImage,
Text,
Audio,
}
21 changes: 20 additions & 1 deletion async-openai/src/types/realtime/response_resource.rs
@@ -21,7 +21,9 @@ pub enum ResponseStatus {

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct FailedError {
pub code: String,
#[serde(rename = "type")]
pub error_type: Option<String>,
pub code: Option<String>,
pub message: String,
}

@@ -31,6 +33,17 @@ pub enum IncompleteReason {
Interruption,
MaxOutputTokens,
ContentFilter,
TokenLimit,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum FinishReason {
Stop,
Length,
ToolCalls,
ContentFilter,
FunctionCall,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
@@ -58,4 +71,10 @@ pub struct ResponseResource {
pub output: Vec<Item>,
/// Usage statistics for the response.
pub usage: Option<Usage>,
/// The Unix timestamp (in seconds) for when the response was created.
#[serde(skip_serializing_if = "Option::is_none")]
pub created_at: Option<u64>,
/// The reason the model stopped generating tokens, if applicable.
#[serde(skip_serializing_if = "Option::is_none")]
pub finish_reason: Option<FinishReason>,
}
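
Review note (not part of the diff): a small test sketch confirming that the looser `FailedError` shape accepts both a payload with the renamed "type" field and a minimal one carrying only a message. The JSON bodies are illustrative rather than taken from the API reference, and the sketch assumes `serde_json` is available as a dev dependency.

#[cfg(test)]
mod failed_error_shape {
    use super::FailedError;

    #[test]
    fn deserializes_with_and_without_optional_fields() {
        // Payload carrying the renamed "type" field plus a code.
        let full: FailedError = serde_json::from_str(
            r#"{"type":"invalid_request_error","code":"item_not_found","message":"Item does not exist."}"#,
        )
        .unwrap();
        assert_eq!(full.error_type.as_deref(), Some("invalid_request_error"));

        // Minimal payload: only a message. Both Option fields default to None.
        let minimal: FailedError =
            serde_json::from_str(r#"{"message":"Something went wrong."}"#).unwrap();
        assert!(minimal.error_type.is_none() && minimal.code.is_none());
    }
}
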
28 changes: 25 additions & 3 deletions async-openai/src/types/realtime/server_event.rs
@@ -38,11 +38,11 @@ pub struct ConversationCreatedEvent {
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputAudioBufferCommitedEvent {
pub struct InputAudioBufferCommittedEvent {
/// The unique ID of the server event.
pub event_id: String,
/// The ID of the preceding item after which the new item will be inserted.
pub previous_item_id: String,
pub previous_item_id: Option<String>,
/// The ID of the user message item that will be created.
pub item_id: String,
}
@@ -53,6 +53,12 @@ pub struct InputAudioBufferClearedEvent {
pub event_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OutputAudioBufferClearedEvent {
/// The unique ID of the server event.
pub event_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputAudioBufferSpeechStartedEvent {
/// The unique ID of the server event.
@@ -154,6 +160,14 @@ pub struct ConversationItemDeletedEvent {
pub item_id: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ConversationItemRetrievedEvent {
/// The unique ID of the server event.
pub event_id: String,
/// The item that was retrieved.
pub item: Item,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseCreatedEvent {
/// The unique ID of the server event.
@@ -381,12 +395,16 @@ pub enum ServerEvent {

/// Returned when an input audio buffer is committed, either by the client or automatically in server VAD mode.
#[serde(rename = "input_audio_buffer.committed")]
InputAudioBufferCommited(InputAudioBufferCommitedEvent),
InputAudioBufferCommitted(InputAudioBufferCommittedEvent),

/// Returned when the input audio buffer is cleared by the client.
#[serde(rename = "input_audio_buffer.cleared")]
InputAudioBufferCleared(InputAudioBufferClearedEvent),

/// Returned when the output audio buffer is cleared by the client (WebRTC specific).
#[serde(rename = "output_audio_buffer.cleared")]
OutputAudioBufferCleared(OutputAudioBufferClearedEvent),

/// Returned in server turn detection mode when speech is detected.
#[serde(rename = "input_audio_buffer.speech_started")]
InputAudioBufferSpeechStarted(InputAudioBufferSpeechStartedEvent),
@@ -422,6 +440,10 @@ pub enum ServerEvent {
#[serde(rename = "conversation.item.deleted")]
ConversationItemDeleted(ConversationItemDeletedEvent),

/// Returned when an item in the conversation is retrieved.
#[serde(rename = "conversation.item.retrieved")]
ConversationItemRetrieved(ConversationItemRetrievedEvent),

/// Returned when a new Response is created. The first event of response creation, where the response is in an initial state of "in_progress".
#[serde(rename = "response.created")]
ResponseCreated(ResponseCreatedEvent),
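
Review note (not part of the diff): the new server-side variants slot into an ordinary match on `ServerEvent`; the handler below only touches fields shown in this hunk.

// Sketch only: handles the two variants added by this change and ignores the rest.
fn handle_new_server_events(event: &ServerEvent) {
    match event {
        ServerEvent::OutputAudioBufferCleared(e) => {
            // Playback buffer was flushed after a client-side clear (WebRTC).
            println!("output audio buffer cleared (event {})", e.event_id);
        }
        ServerEvent::ConversationItemRetrieved(e) => {
            // Server's canonical copy of the requested item arrived.
            println!("retrieved conversation item (event {})", e.event_id);
        }
        _ => {}
    }
}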