From 2e90177fa7f997c47c1250c2d431a30f487b72a3 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Fri, 24 Oct 2025 13:01:00 +0900 Subject: [PATCH 1/4] Support realtime GA models --- python/pyproject.toml | 2 +- ...time_agent_with_function_calling_webrtc.py | 1 + .../realtime/simple_realtime_chat_webrtc.py | 33 ++-- .../open_ai_realtime_execution_settings.py | 44 ++++++ .../ai/open_ai/services/_open_ai_realtime.py | 148 ++++++++++++------ .../ai/open_ai/services/azure_realtime.py | 2 +- .../ai/open_ai/settings/open_ai_settings.py | 2 +- .../open_ai/services/test_openai_realtime.py | 109 +++++++------ python/uv.lock | 8 +- 9 files changed, 241 insertions(+), 108 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index d1b22c85541a..2b38f3d9344b 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -37,7 +37,7 @@ dependencies = [ "numpy >= 1.25.0; python_version < '3.12'", "numpy >= 1.26.0; python_version >= '3.12'", # openai connector - "openai >= 1.98.0", + "openai >= 2.0.0", # openapi and swagger "openapi_core >= 0.18,<0.20", "websockets >= 13, < 16", diff --git a/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py b/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py index f17f69229c48..69333c79c463 100644 --- a/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py +++ b/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py @@ -103,6 +103,7 @@ async def main() -> None: flowery prose. """, voice="alloy", + output_modalities=["text", "audio"], turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8), function_choice_behavior=FunctionChoiceBehavior.Auto(), ) diff --git a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py index 5f32f4d949fa..901dd9d26c12 100644 --- a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py +++ b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py @@ -9,6 +9,7 @@ OpenAIRealtimeExecutionSettings, OpenAIRealtimeWebRTC, ) +from semantic_kernel.contents import RealtimeTextEvent logging.basicConfig(level=logging.WARNING) utils_log = logging.getLogger("samples.concepts.realtime.utils") @@ -55,6 +56,8 @@ async def main() -> None: # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice # for more details. voice="alloy", + # Enable both text and audio output to get transcripts + output_modalities=["text", "audio"], ) realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings) # Create the settings for the session @@ -62,16 +65,26 @@ async def main() -> None: # the context manager calls the create_session method on the client and starts listening to the audio stream async with audio_player, realtime_client: async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback): - match event.event_type: - case "text": - # the model returns both audio and transcript of the audio, which we will print - print(event.text.text, end="") - case "service": - # OpenAI Specific events - if event.service_type == ListenEvents.SESSION_UPDATED: - print("Session updated") - if event.service_type == ListenEvents.RESPONSE_CREATED: - print("\nMosscap (transcript): ", end="") + match event: + case RealtimeTextEvent(): + # Only process delta events for streaming, skip done events to avoid duplication + if ( + hasattr(event, "service_type") + and "delta" in event.service_type + and hasattr(event.text, "text") + and event.text.text + ): + print(event.text.text, end="", flush=True) + # Add newline when transcript is complete (done event) + elif hasattr(event, "service_type") and "done" in event.service_type: + print() # Add newline for readability + case _: + # Handle other events including service events + if hasattr(event, "event_type") and event.event_type == "service": + if hasattr(event, "service_type") and event.service_type == ListenEvents.SESSION_UPDATED: + print("Session updated") + if hasattr(event, "service_type") and event.service_type == ListenEvents.RESPONSE_CREATED: + print("\nMosscap (transcript): ", end="") if __name__ == "__main__": diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py index 2660187de902..24b03ab3b9ae 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -55,6 +55,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): """Request settings for OpenAI realtime services.""" modalities: Sequence[Literal["audio", "text"]] | None = None + output_modalities: Sequence[Literal["audio", "text"]] | None = None ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None instructions: str | None = None voice: str | None = None @@ -80,6 +81,49 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None + def prepare_settings_dict(self, **kwargs) -> dict[str, Any]: + """Prepare the settings as a dictionary for sending to the AI service. + + For realtime settings, we need to properly structure the audio configuration + to match the OpenAI API expectations where voice and turn_detection are nested + under the audio field. + """ + # Get the base settings dict (excludes service_id, extension_data, etc.) + settings_dict = super().prepare_settings_dict(**kwargs) + + # Build the audio configuration object + audio_config = {} + + # Handle voice (goes in audio.output.voice) + if "voice" in settings_dict: + audio_config.setdefault("output", {})["voice"] = settings_dict.pop("voice") + + # Handle turn_detection (goes in audio.input.turn_detection) + if "turn_detection" in settings_dict: + audio_config.setdefault("input", {})["turn_detection"] = settings_dict.pop("turn_detection") + + # Handle input audio format + if "input_audio_format" in settings_dict: + audio_config.setdefault("input", {})["format"] = settings_dict.pop("input_audio_format") + + # Handle output audio format + if "output_audio_format" in settings_dict: + audio_config.setdefault("output", {})["format"] = settings_dict.pop("output_audio_format") + + # Handle input audio transcription + if "input_audio_transcription" in settings_dict: + audio_config.setdefault("input", {})["transcription"] = settings_dict.pop("input_audio_transcription") + + # Handle input audio noise reduction + if "input_audio_noise_reduction" in settings_dict: + audio_config.setdefault("input", {})["noise_reduction"] = settings_dict.pop("input_audio_noise_reduction") + + # Add the audio config if it has any content + if audio_config: + settings_dict["audio"] = audio_config + + return settings_dict + class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings): """Request settings for Azure OpenAI realtime services.""" diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py index fc07450bf594..98653e103153 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py @@ -23,8 +23,8 @@ from av.audio.frame import AudioFrame from numpy import ndarray from openai._models import construct_type_unchecked -from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection -from openai.types.beta.realtime import ( +from openai.resources.realtime.realtime import AsyncRealtimeConnection +from openai.types.realtime import ( ConversationItemCreateEvent, ConversationItemDeleteEvent, ConversationItemTruncateEvent, @@ -32,13 +32,16 @@ InputAudioBufferClearEvent, InputAudioBufferCommitEvent, RealtimeClientEvent, + RealtimeConversationItemFunctionCall, + RealtimeConversationItemFunctionCallOutput, + RealtimeConversationItemUserMessage, + RealtimeResponseCreateParams, RealtimeServerEvent, ResponseCancelEvent, ResponseCreateEvent, ResponseFunctionCallArgumentsDoneEvent, SessionUpdateEvent, ) -from openai.types.beta.realtime.response_create_event import Response from pydantic import Field, PrivateAttr from semantic_kernel.connectors.ai.function_call_choice_configuration import FunctionCallChoiceConfiguration @@ -163,9 +166,15 @@ def _create_openai_realtime_client_event(event_type: SendEvents | str, **kwargs: case SendEvents.SESSION_UPDATE: if "session" not in kwargs: raise ContentException("Session is required for SessionUpdateEvent") + session_dict = kwargs.pop("session") + # Create proper RealtimeSessionCreateRequest with required type field for SDK validation + # The OpenAI SDK will handle the proper serialization for the API + from openai.types.realtime import RealtimeSessionCreateRequest + + session_request = RealtimeSessionCreateRequest(type="realtime", **session_dict) return SessionUpdateEvent( type=event_type.value, - session=kwargs.pop("session"), + session=session_request, **kwargs, ) case SendEvents.INPUT_AUDIO_BUFFER_APPEND: @@ -206,7 +215,9 @@ def _create_openai_realtime_client_event(event_type: SendEvents | str, **kwargs: ) case SendEvents.RESPONSE_CREATE: if "response" in kwargs: - response: Response | None = Response.model_validate(kwargs.pop("response")) + response: RealtimeResponseCreateParams | None = RealtimeResponseCreateParams.model_validate( + kwargs.pop("response") + ) else: response = None return ResponseCreateEvent( @@ -246,10 +257,10 @@ class ListenEvents(str, Enum): RESPONSE_CONTENT_PART_DONE = "response.content_part.done" RESPONSE_TEXT_DELTA = "response.text.delta" RESPONSE_TEXT_DONE = "response.text.done" - RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.audio_transcript.delta" - RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.audio_transcript.done" - RESPONSE_AUDIO_DELTA = "response.audio.delta" - RESPONSE_AUDIO_DONE = "response.audio.done" + RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.output_audio_transcript.delta" + RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.output_audio_transcript.done" + RESPONSE_AUDIO_DELTA = "response.output_audio.delta" + RESPONSE_AUDIO_DONE = "response.output_audio.done" RESPONSE_FUNCTION_CALL_ARGUMENTS_DELTA = "response.function_call_arguments.delta" RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE = "response.function_call_arguments.done" RATE_LIMITS_UPDATED = "rate_limits.updated" @@ -291,7 +302,7 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt might be of different types. """ match event.type: - case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value: + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value | "response.audio_transcript.delta": yield RealtimeTextEvent( service_type=event.type, service_event=event, @@ -301,6 +312,15 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt choice_index=0, ), ) + case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE.value | "response.audio_transcript.done": + yield RealtimeTextEvent( + service_type=event.type, + service_event=event, + text=TextContent( + inner_content=event, + text=event.transcript, # type: ignore + ), + ) case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore @@ -323,7 +343,9 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt if parsed_event: yield parsed_event case ListenEvents.ERROR.value: - logger.error("Error received: %s", event.error.model_dump_json()) # type: ignore + # In GA API, event.error is a dict instead of an object + error_info = event.error if isinstance(event.error, dict) else event.error.model_dump() # type: ignore + logger.error("Error received: %s", error_info) # type: ignore yield RealtimeEvent(service_type=event.type, service_event=event) case ListenEvents.SESSION_CREATED.value | ListenEvents.SESSION_UPDATED.value: logger.info("Session created or updated, session: %s", event.session.model_dump_json()) # type: ignore @@ -483,43 +505,43 @@ async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, - item={ - "type": "message", - "content": [ + item=RealtimeConversationItemUserMessage( + type="message", + content=[ { "type": "input_text", "text": event.text.text, } ], - "role": "user", - }, + role="user", + ), ) ) case RealtimeFunctionCallEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, - item={ - "type": "function_call", - "name": event.function_call.name or event.function_call.function_name, - "arguments": "" + item=RealtimeConversationItemFunctionCall( + type="function_call", + name=event.function_call.name or event.function_call.function_name, + arguments="" if not event.function_call.arguments else event.function_call.arguments if isinstance(event.function_call.arguments, str) else json.dumps(event.function_call.arguments), - "call_id": event.function_call.metadata.get("call_id"), - }, + call_id=event.function_call.metadata.get("call_id"), + ), ) ) case RealtimeFunctionResultEvent(): await self._send( _create_openai_realtime_client_event( event_type=SendEvents.CONVERSATION_ITEM_CREATE, - item={ - "type": "function_call_output", - "output": event.function_result.result, - "call_id": event.function_result.metadata.get("call_id"), - }, + item=RealtimeConversationItemFunctionCallOutput( + type="function_call_output", + output=event.function_result.result, + call_id=event.function_result.metadata.get("call_id"), + ), ) ) case _: @@ -575,32 +597,32 @@ async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: await self._send( _create_openai_realtime_client_event( event_type=event.service_type, - item={ - "type": "message", - "content": [ + item=RealtimeConversationItemUserMessage( + type="message", + content=[ { "type": "input_text", "text": item.text, } ], - "role": "user", - }, + role="user", + ), ) ) case FunctionCallContent(): await self._send( _create_openai_realtime_client_event( event_type=event.service_type, - item={ - "type": "function_call", - "name": item.name or item.function_name, - "arguments": "" + item=RealtimeConversationItemFunctionCall( + type="function_call", + name=item.name or item.function_name, + arguments="" if not item.arguments else item.arguments if isinstance(item.arguments, str) else json.dumps(item.arguments), - "call_id": item.metadata.get("call_id"), - }, + call_id=item.metadata.get("call_id"), + ), ) ) @@ -608,11 +630,11 @@ async def send(self, event: RealtimeEvents, **kwargs: Any) -> None: await self._send( _create_openai_realtime_client_event( event_type=event.service_type, - item={ - "type": "function_call_output", - "output": item.result, - "call_id": item.metadata.get("call_id"), - }, + item=RealtimeConversationItemFunctionCallOutput( + type="function_call_output", + output=item.result, + call_id=item.metadata.get("call_id"), + ), ) ) case SendEvents.CONVERSATION_ITEM_TRUNCATE: @@ -691,7 +713,36 @@ async def _send(self, event: RealtimeClientEvent) -> None: while self.data_channel.readyState != "open": await asyncio.sleep(0.1) try: - self.data_channel.send(event.model_dump_json(exclude_none=True)) + # Handle session update specially to exclude type field for WebRTC + if hasattr(event, "type") and event.type == "session.update": + event_dict = event.model_dump(exclude_none=True) + # Remove fields that aren't allowed in session updates for WebRTC compatibility + # Audio configuration should be set during session creation, not updates + session_dict = event_dict.get("session") + if session_dict and isinstance(session_dict, dict): + # Only keep fields that are allowed in session updates + # Note: output_modalities is not allowed in WebRTC session updates + allowed_fields = { + "instructions", + "model", + "max_output_tokens", + "tools", + "tool_choice", + "temperature", + "prompt", + "tracing", + "truncation", + } + event_dict["session"] = {k: v for k, v in session_dict.items() if k in allowed_fields} + + # Debug: Log what we're sending to see the structure + import json + + json_data = json.dumps(event_dict) + logger.debug(f"Sending WebRTC session.update: {json_data}") + self.data_channel.send(json_data) + else: + self.data_channel.send(event.model_dump_json(exclude_none=True)) except Exception as e: logger.error(f"Failed to send event {event} with error: {e!s}") @@ -834,11 +885,11 @@ def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]: return { "Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json", - }, f"{self.client.beta.realtime._client.base_url}/realtime/sessions" + }, f"{self.client.realtime._client.base_url}/realtime/sessions" def _get_webrtc_url(self) -> str: """Get the WebRTC URL.""" - return f"{self.client.beta.realtime._client.base_url}/realtime?model={self.ai_model_id}" + return f"{self.client.realtime._client.base_url}/realtime?model={self.ai_model_id}" # region Websocket @@ -882,6 +933,9 @@ async def _send(self, event: RealtimeClientEvent) -> None: if not self.connection: raise ValueError("Connection is not established.") try: + # Debug logging to see what we're actually sending + if hasattr(event, "type") and event.type == "session.update": + logger.debug(f"Sending session.update event: {event.model_dump()}") await self.connection.send(event) except Exception as e: logger.error(f"Error sending response: {e!s}") @@ -894,7 +948,7 @@ async def create_session( **kwargs: Any, ) -> None: """Create a session in the service.""" - self.connection = await self.client.beta.realtime.connect( + self.connection = await self.client.realtime.connect( model=self.ai_model_id, extra_headers={USER_AGENT: SEMANTIC_KERNEL_USER_AGENT} ).enter() self.connected.set() diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index 32a11f9ec807..fa45cf8eef97 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -253,7 +253,7 @@ def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]: def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]: """Get the headers and URL for the ephemeral token.""" url = ( - f"{self.client.beta.realtime._client.base_url}/realtimeapi/sessions?api-version=" + f"{self.client.realtime._client.base_url}/realtimeapi/sessions?api-version=" f"{self.client._api_version}" # type: ignore[attr-defined] ) if self.client._azure_ad_token is not None: # type: ignore[attr-defined] diff --git a/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py b/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py index 6ead680aa4f4..ba92b93b0983 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/settings/open_ai_settings.py @@ -35,7 +35,7 @@ class OpenAISettings(KernelBaseSettings): - text_to_audio_model_id: str | None - The OpenAI text to audio model ID to use, for example, jukebox-1. (Env var OPENAI_TEXT_TO_AUDIO_MODEL_ID) - realtime_model_id: str | None - The OpenAI realtime model ID to use, - for example, gpt-4o-realtime-preview-2024-12-17. + for example, gpt-realtime, gpt-realtime-mini, or gpt-audio-mini. (Env var OPENAI_REALTIME_MODEL_ID) - env_file_path: str | None - if provided, the .env settings are read from this file path location """ diff --git a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py index ce05661d2832..3a7ad49c0732 100644 --- a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py +++ b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py @@ -9,28 +9,29 @@ from aiortc import AudioStreamTrack, RTCDataChannel, RTCPeerConnection from numpy import ndarray from openai import AsyncOpenAI -from openai.resources.beta.realtime.realtime import ( +from openai.resources.realtime.realtime import ( AsyncRealtimeConnection, AsyncRealtimeConnectionManager, ) -from openai.types.beta.realtime import ( - ConversationItem, - ConversationItemContent, +from openai.types.realtime import ( ConversationItemCreatedEvent, ConversationItemCreateEvent, ConversationItemDeletedEvent, ConversationItemDeleteEvent, ConversationItemTruncatedEvent, ConversationItemTruncateEvent, - ErrorEvent, InputAudioBufferAppendEvent, InputAudioBufferClearedEvent, InputAudioBufferClearEvent, InputAudioBufferCommitEvent, InputAudioBufferCommittedEvent, InputAudioBufferSpeechStartedEvent, + RealtimeConversationItemFunctionCall, + RealtimeConversationItemFunctionCallOutput, + RealtimeConversationItemUserMessage, RealtimeResponse, RealtimeServerEvent, + RealtimeSessionCreateRequest, ResponseAudioDeltaEvent, ResponseAudioDoneEvent, ResponseAudioTranscriptDeltaEvent, @@ -40,11 +41,11 @@ ResponseFunctionCallArgumentsDeltaEvent, ResponseFunctionCallArgumentsDoneEvent, ResponseOutputItemAddedEvent, - Session, SessionCreatedEvent, SessionUpdatedEvent, SessionUpdateEvent, ) +from openai.types.realtime.realtime_error import RealtimeError as ErrorEvent from pydantic import ValidationError from pytest import fixture, mark, param, raises @@ -85,28 +86,36 @@ from semantic_kernel.kernel import Kernel events = [ - SessionCreatedEvent(type=ListenEvents.SESSION_CREATED, session=Session(id="session_id"), event_id="1"), - SessionUpdatedEvent(type=ListenEvents.SESSION_UPDATED, session=Session(id="session_id"), event_id="2"), + SessionCreatedEvent( + type=ListenEvents.SESSION_CREATED.value, session=RealtimeSessionCreateRequest(type="realtime"), event_id="1" + ), + SessionUpdatedEvent( + type=ListenEvents.SESSION_UPDATED.value, session=RealtimeSessionCreateRequest(type="realtime"), event_id="2" + ), ConversationItemCreatedEvent( - type=ListenEvents.CONVERSATION_ITEM_CREATED, - item=ConversationItem(id="item_id"), + type=ListenEvents.CONVERSATION_ITEM_CREATED.value, + item=RealtimeConversationItemUserMessage(id="item_id", type="message", role="user", content=[]), event_id="3", previous_item_id="2", ), - ConversationItemDeletedEvent(type=ListenEvents.CONVERSATION_ITEM_DELETED, item_id="item_id", event_id="4"), + ConversationItemDeletedEvent(type=ListenEvents.CONVERSATION_ITEM_DELETED.value, item_id="item_id", event_id="4"), ConversationItemTruncatedEvent( - type=ListenEvents.CONVERSATION_ITEM_TRUNCATED, event_id="5", audio_end_ms=0, content_index=0, item_id="item_id" + type=ListenEvents.CONVERSATION_ITEM_TRUNCATED.value, + event_id="5", + audio_end_ms=0, + content_index=0, + item_id="item_id", ), - InputAudioBufferClearedEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_CLEARED, event_id="7"), + InputAudioBufferClearedEvent(type=ListenEvents.INPUT_AUDIO_BUFFER_CLEARED.value, event_id="7"), InputAudioBufferCommittedEvent( - type=ListenEvents.INPUT_AUDIO_BUFFER_COMMITTED, + type=ListenEvents.INPUT_AUDIO_BUFFER_COMMITTED.value, event_id="8", item_id="item_id", previous_item_id="previous_item_id", ), - ResponseCreatedEvent(type=ListenEvents.RESPONSE_CREATED, event_id="10", response=RealtimeResponse()), + ResponseCreatedEvent(type=ListenEvents.RESPONSE_CREATED.value, event_id="10", response=RealtimeResponse()), ResponseFunctionCallArgumentsDoneEvent( - type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE, + type=ListenEvents.RESPONSE_FUNCTION_CALL_ARGUMENTS_DONE.value, event_id="11", arguments="{}", call_id="call_id", @@ -115,7 +124,7 @@ response_id="response_id", ), ResponseAudioTranscriptDeltaEvent( - type=ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA, + type=ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value, event_id="12", content_index=0, delta="text", @@ -124,7 +133,7 @@ response_id="response_id", ), ResponseAudioDoneEvent( - type=ListenEvents.RESPONSE_AUDIO_DONE, + type=ListenEvents.RESPONSE_AUDIO_DONE.value, event_id="13", item_id="item_id", output_index=0, @@ -132,7 +141,7 @@ content_index=0, ), ResponseAudioDeltaEvent( - type=ListenEvents.RESPONSE_AUDIO_DELTA, + type=ListenEvents.RESPONSE_AUDIO_DELTA.value, event_id="14", item_id="item_id", output_index=0, @@ -253,7 +262,7 @@ def test_openai_realtime_webrtc(openai_unit_test_env, audio_track): { "event_id": "event_id", "previous_item_id": "previous_item_id", - "item": {"id": "item_id"}, + "item": RealtimeConversationItemUserMessage(id="item_id", type="message", role="user", content=[]), }, ConversationItemCreateEvent, None, @@ -325,14 +334,14 @@ def test_create_openai_realtime_event( event_id="event_id", output_index=0, response_id="response_id", - type="response.audio_transcript.delta", + type="response.output_audio_transcript.delta", ), [RealtimeTextEvent], id="response_audio_transcript_delta", ), param( ResponseOutputItemAddedEvent( - item=ConversationItem(id="item_id"), + item=RealtimeConversationItemUserMessage(id="item_id", type="message", role="user", content=[]), event_id="event_id", output_index=0, response_id="response_id", @@ -343,7 +352,9 @@ def test_create_openai_realtime_event( ), param( ResponseOutputItemAddedEvent( - item=ConversationItem(id="item_id", type="function_call", call_id="call_id", name="function_to_call"), + item=RealtimeConversationItemFunctionCall( + id="item_id", type="function_call", call_id="call_id", name="function_to_call", arguments="" + ), event_id="event_id", output_index=0, response_id="response_id", @@ -382,6 +393,7 @@ def test_create_openai_realtime_event( ErrorEvent( error={"code": "error_code", "message": "error_message", "type": "invalid_request_error"}, event_id="event_id", + message="error_message", type="error", ), [RealtimeEvent], @@ -389,7 +401,7 @@ def test_create_openai_realtime_event( ), param( SessionCreatedEvent( - session=Session(id="session_id"), + session=RealtimeSessionCreateRequest(type="realtime"), event_id="event_id", type="session.created", ), @@ -398,7 +410,7 @@ def test_create_openai_realtime_event( ), param( SessionUpdatedEvent( - session=Session(id="session_id"), + session=RealtimeSessionCreateRequest(type="realtime"), event_id="event_id", type="session.updated", ), @@ -432,14 +444,24 @@ async def test_update_session(OpenAIWebsocket, kernel): role="assistant", items=[ FunctionCallContent( - function_name="function_name", plugin_name="plugin", arguments={"arg1": "value"}, id="1" + function_name="function_name", + plugin_name="plugin", + arguments={"arg1": "value"}, + id="1", + metadata={"call_id": "call_1"}, ) ], ), ChatMessageContent( role="tool", items=[ - FunctionResultContent(function_name="function_name", plugin_name="plugin", result="result", id="1") + FunctionResultContent( + function_name="function_name", + plugin_name="plugin", + result="result", + id="1", + metadata={"call_id": "call_1"}, + ) ], ), ChatMessageContent( @@ -451,7 +473,7 @@ async def test_update_session(OpenAIWebsocket, kernel): ), ] ) - settings = OpenAIRealtimeExecutionSettings(instructions="instructions", ai_model_id="gpt-4o-realtime-preview") + settings = OpenAIRealtimeExecutionSettings(instructions="instructions", ai_model_id="gpt-realtime") with patch.object(OpenAIWebsocket, "_send") as mock_send: await OpenAIWebsocket.update_session( chat_history=chat_history, settings=settings, create_response=True, kernel=kernel @@ -477,7 +499,7 @@ async def test_parse_function_call_arguments_done(OpenAIWebsocket, kernel): ) response_events = [RealtimeFunctionCallEvent, RealtimeFunctionResultEvent] OpenAIWebsocket._current_settings = OpenAIRealtimeExecutionSettings( - instructions="instructions", ai_model_id="gpt-4o-realtime-preview" + instructions="instructions", ai_model_id="gpt-realtime" ) OpenAIWebsocket._current_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() OpenAIWebsocket._call_id_to_function_map["call_id"] = "plugin_name-function_name" @@ -494,7 +516,7 @@ async def test_parse_function_call_arguments_done(OpenAIWebsocket, kernel): mock_send.assert_any_await( ConversationItemCreateEvent( type="conversation.item.create", - item=ConversationItem( + item=RealtimeConversationItemFunctionCallOutput( type="function_call_output", output=func_result, call_id="call_id", @@ -516,7 +538,7 @@ async def test_parse_function_call_arguments_done_fail(OpenAIWebsocket, kernel): ) response_events = [RealtimeEvent] OpenAIWebsocket._current_settings = OpenAIRealtimeExecutionSettings( - instructions="instructions", ai_model_id="gpt-4o-realtime-preview" + instructions="instructions", ai_model_id="gpt-realtime" ) OpenAIWebsocket._current_settings.function_choice_behavior = FunctionChoiceBehavior.Auto() # This function name is invalid @@ -549,7 +571,7 @@ async def test_send_audio(OpenAIWebsocket): @mark.parametrize("client", ["OpenAIWebRTC", "OpenAIWebsocket"]) async def test_send_session_update(client, OpenAIWebRTC, OpenAIWebsocket): openai_client = OpenAIWebRTC if client == "OpenAIWebRTC" else OpenAIWebsocket - settings = PromptExecutionSettings(ai_model_id="gpt-4o-realtime-preview") + settings = PromptExecutionSettings(ai_model_id="gpt-realtime") session_event = RealtimeEvent( service_type=SendEvents.SESSION_UPDATE, service_event={"settings": settings}, @@ -560,7 +582,7 @@ async def test_send_session_update(client, OpenAIWebRTC, OpenAIWebsocket): assert len(mock_send.await_args_list) == 1 mock_send.assert_any_await( SessionUpdateEvent( - session={"model": "gpt-4o-realtime-preview"}, + session={"model": "gpt-realtime", "type": "realtime"}, type="session.update", ) ) @@ -601,8 +623,8 @@ async def test_send_conversation_item_create(client, OpenAIWebRTC, OpenAIWebsock assert len(mock_send.await_args_list) == 3 mock_send.assert_any_await( ConversationItemCreateEvent( - item=ConversationItem( - content=[ConversationItemContent(text="Hello", type="input_text")], + item=RealtimeConversationItemUserMessage( + content=[{"text": "Hello", "type": "input_text"}], role="user", type="message", ), @@ -611,7 +633,7 @@ async def test_send_conversation_item_create(client, OpenAIWebRTC, OpenAIWebsock ) mock_send.assert_any_await( ConversationItemCreateEvent( - item=ConversationItem( + item=RealtimeConversationItemFunctionCall( arguments='{"arg1": "value"}', call_id="call_id", name="plugin-function_name", @@ -622,7 +644,7 @@ async def test_send_conversation_item_create(client, OpenAIWebRTC, OpenAIWebsock ) mock_send.assert_any_await( ConversationItemCreateEvent( - item=ConversationItem( + item=RealtimeConversationItemFunctionCallOutput( call_id="call_id", output="result", type="function_call_output", @@ -639,7 +661,7 @@ async def test_receive_websocket(OpenAIWebsocket): manager = AsyncMock(spec=AsyncRealtimeConnectionManager) manager.enter.return_value = connection_mock - with patch("openai.resources.beta.realtime.realtime.AsyncRealtime.connect") as mock_connect: + with patch("openai.resources.realtime.realtime.AsyncRealtime.connect") as mock_connect: mock_connect.return_value = manager async with OpenAIWebsocket(): async for msg in OpenAIWebsocket.receive(): @@ -674,7 +696,7 @@ async def openai_realtime_base(): return OpenAIRealtimeWebRTC( audio_track=audio_track_mock, client=async_openai_mock, - ai_model_id="gpt-4o-realtime-preview", + ai_model_id="gpt-realtime", kernel=kernel_mock, ) @@ -810,7 +832,7 @@ def mocked_open_ai_realtime_webrtc(mocked_audio_track, mocked_audio_output_callb return OpenAIRealtimeWebRTC( audio_track=mocked_audio_track, audio_output_callback=mocked_audio_output_callback, - ai_model_id="gpt-4o-realtime-preview", + ai_model_id="gpt-realtime", client=async_openai_mock, api_key="fake-api-key", ) @@ -870,10 +892,9 @@ async def test_create_session_initializes_peer_connection(mock_post, mocked_open mocked_open_ai_realtime_webrtc._get_ephemeral_token = AsyncMock(return_value="fake-token") mocked_open_ai_realtime_webrtc.client = AsyncMock(spec=AsyncOpenAI) mocked_open_ai_realtime_webrtc.client.api_key = "fake-api-key" - mocked_open_ai_realtime_webrtc.client.beta = AsyncMock() - mocked_open_ai_realtime_webrtc.client.beta.realtime = AsyncMock() - mocked_open_ai_realtime_webrtc.client.beta.realtime._client = AsyncMock() - mocked_open_ai_realtime_webrtc.client.beta.realtime._client.base_url = "https://api.openai.com" + mocked_open_ai_realtime_webrtc.client.realtime = AsyncMock() + mocked_open_ai_realtime_webrtc.client.realtime._client = AsyncMock() + mocked_open_ai_realtime_webrtc.client.realtime._client.base_url = "https://api.openai.com" await mocked_open_ai_realtime_webrtc.create_session() assert mocked_open_ai_realtime_webrtc.peer_connection is not None diff --git a/python/uv.lock b/python/uv.lock index e06bb5906896..78ffbae03ce4 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -3547,7 +3547,7 @@ wheels = [ [[package]] name = "openai" -version = "1.99.9" +version = "2.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyio", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, @@ -3559,9 +3559,9 @@ dependencies = [ { name = "tqdm", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux' or sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8a/d2/ef89c6f3f36b13b06e271d3cc984ddd2f62508a0972c1cbcc8485a6644ff/openai-1.99.9.tar.gz", hash = "sha256:f2082d155b1ad22e83247c3de3958eb4255b20ccf4a1de2e6681b6957b554e92", size = 506992, upload-time = "2025-08-12T02:31:10.054Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/c7/e42bcd89dfd47fec8a30b9e20f93e512efdbfbb3391b05bbb79a2fb295fa/openai-2.6.0.tar.gz", hash = "sha256:f119faf7fc07d7e558c1e7c32c873e241439b01bd7480418234291ee8c8f4b9d", size = 592904, upload-time = "2025-10-20T17:17:24.588Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e8/fb/df274ca10698ee77b07bff952f302ea627cc12dac6b85289485dd77db6de/openai-1.99.9-py3-none-any.whl", hash = "sha256:9dbcdb425553bae1ac5d947147bebbd630d91bbfc7788394d4c4f3a35682ab3a", size = 786816, upload-time = "2025-08-12T02:31:08.34Z" }, + { url = "https://files.pythonhosted.org/packages/c0/0a/58e9dcd34abe273eaeac3807a8483073767b5609d01bb78ea2f048e515a0/openai-2.6.0-py3-none-any.whl", hash = "sha256:f33fa12070fe347b5787a7861c8dd397786a4a17e1c3186e239338dac7e2e743", size = 1005403, upload-time = "2025-10-20T17:17:22.091Z" }, ] [[package]] @@ -6048,7 +6048,7 @@ requires-dist = [ { name = "numpy", marker = "python_full_version >= '3.12'", specifier = ">=1.26.0" }, { name = "ollama", marker = "extra == 'ollama'", specifier = "~=0.4" }, { name = "onnxruntime-genai", marker = "extra == 'onnx'", specifier = "~=0.7" }, - { name = "openai", specifier = ">=1.98.0" }, + { name = "openai", specifier = ">=2.0.0" }, { name = "openapi-core", specifier = ">=0.18,<0.20" }, { name = "opentelemetry-api", specifier = "~=1.24" }, { name = "opentelemetry-sdk", specifier = "~=1.24" }, From 1442dbb9669bf68ae37005c2d7686a22eb4c23d6 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Fri, 24 Oct 2025 13:08:21 +0900 Subject: [PATCH 2/4] Typing fix --- .../open_ai_realtime_execution_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py index 24b03ab3b9ae..4d0647de10c2 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -92,7 +92,7 @@ def prepare_settings_dict(self, **kwargs) -> dict[str, Any]: settings_dict = super().prepare_settings_dict(**kwargs) # Build the audio configuration object - audio_config = {} + audio_config: dict[str, Any] = {} # Handle voice (goes in audio.output.voice) if "voice" in settings_dict: From 95bd8a8aaa1695903fc942901b67599a568131f6 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Fri, 24 Oct 2025 13:12:03 +0900 Subject: [PATCH 3/4] cleanup sample --- .../realtime/simple_realtime_chat_webrtc.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py index 901dd9d26c12..4f007ea4342e 100644 --- a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py +++ b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py @@ -68,22 +68,17 @@ async def main() -> None: match event: case RealtimeTextEvent(): # Only process delta events for streaming, skip done events to avoid duplication - if ( - hasattr(event, "service_type") - and "delta" in event.service_type - and hasattr(event.text, "text") - and event.text.text - ): + if event.service_type and "delta" in event.service_type and event.text.text: print(event.text.text, end="", flush=True) # Add newline when transcript is complete (done event) - elif hasattr(event, "service_type") and "done" in event.service_type: + elif event.service_type and "done" in event.service_type: print() # Add newline for readability case _: - # Handle other events including service events - if hasattr(event, "event_type") and event.event_type == "service": - if hasattr(event, "service_type") and event.service_type == ListenEvents.SESSION_UPDATED: + # Handle service events + if event.event_type == "service" and event.service_type: + if event.service_type == ListenEvents.SESSION_UPDATED: print("Session updated") - if hasattr(event, "service_type") and event.service_type == ListenEvents.RESPONSE_CREATED: + elif event.service_type == ListenEvents.RESPONSE_CREATED: print("\nMosscap (transcript): ", end="") From 7104e3b715f286080428e91991b1dd9bb4df3623 Mon Sep 17 00:00:00 2001 From: Evan Mattson Date: Fri, 27 Feb 2026 12:41:00 +0900 Subject: [PATCH 4/4] Realtime GA models working --- python/samples/concepts/realtime/README.md | 2 +- ...time_agent_with_function_calling_webrtc.py | 8 +- ...e_agent_with_function_calling_websocket.py | 3 + .../realtime/simple_realtime_chat_webrtc.py | 15 +- .../simple_realtime_chat_websocket.py | 10 +- python/samples/concepts/realtime/utils.py | 1 + .../connectors/ai/open_ai/const.py | 2 +- .../open_ai_realtime_execution_settings.py | 4 +- .../ai/open_ai/services/_open_ai_realtime.py | 69 ++++---- .../ai/open_ai/services/azure_realtime.py | 154 +++++++++++++++--- .../open_ai/services/test_openai_realtime.py | 4 + 11 files changed, 201 insertions(+), 71 deletions(-) diff --git a/python/samples/concepts/realtime/README.md b/python/samples/concepts/realtime/README.md index bd717938ed81..37c2f657f921 100644 --- a/python/samples/concepts/realtime/README.md +++ b/python/samples/concepts/realtime/README.md @@ -5,7 +5,7 @@ These samples are more complex then most because of the nature of these API's. T To run these samples, you will need to have the following setup: - Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set. -- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`. +- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2025-08-28`. - To run the sample with a simple version of a class that handles the incoming and outgoing sound you need to install the following packages in your environment: - semantic-kernel[realtime] - pyaudio diff --git a/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py b/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py index 69333c79c463..e11bc028c05e 100644 --- a/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py +++ b/python/samples/concepts/realtime/realtime_agent_with_function_calling_webrtc.py @@ -5,8 +5,6 @@ from datetime import datetime from random import randint -from azure.identity import AzureCliCredential - from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices from semantic_kernel.connectors.ai import FunctionChoiceBehavior from semantic_kernel.connectors.ai.open_ai import ( @@ -81,8 +79,12 @@ async def main() -> None: # and can also be passed in the receive method # You can also pass in kernel, plugins, chat_history or settings here. # For WebRTC the audio_track is required + + # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28" + # for Azure OpenAI deployments realtime deployments. realtime_agent = AzureRealtimeWebRTC( - audio_track=AudioRecorderWebRTC(), region="swedencentral", plugins=[Helpers()], credential=AzureCliCredential() + audio_track=AudioRecorderWebRTC(), + plugins=[Helpers()], ) # Create the settings for the session diff --git a/python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py b/python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py index 9031a4193dfe..ef853dbb6e09 100644 --- a/python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py +++ b/python/samples/concepts/realtime/realtime_agent_with_function_calling_websocket.py @@ -82,6 +82,9 @@ async def main() -> None: # to signal the end of the user's turn and start the response. # manual VAD is not part of this sample # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection + + # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28" + # for Azure OpenAI deployments realtime deployments. settings = AzureRealtimeExecutionSettings( instructions=""" You are a chat bot. Your name is Mosscap and diff --git a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py index 4f007ea4342e..6e2f0f746ead 100644 --- a/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py +++ b/python/samples/concepts/realtime/simple_realtime_chat_webrtc.py @@ -5,10 +5,10 @@ from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( + AzureRealtimeExecutionSettings, ListenEvents, - OpenAIRealtimeExecutionSettings, - OpenAIRealtimeWebRTC, ) +from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebRTC from semantic_kernel.contents import RealtimeTextEvent logging.basicConfig(level=logging.WARNING) @@ -43,7 +43,7 @@ async def main() -> None: # create the realtime client and optionally add the audio output function, this is optional # you can define the protocol to use, either "websocket" or "webrtc" # they will behave the same way, even though the underlying protocol is quite different - settings = OpenAIRealtimeExecutionSettings( + settings = AzureRealtimeExecutionSettings( instructions=""" You are a chat bot. Your name is Mosscap and you have one goal: figure out what people need. @@ -59,7 +59,12 @@ async def main() -> None: # Enable both text and audio output to get transcripts output_modalities=["text", "audio"], ) - realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings) + # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28" + # for Azure OpenAI deployments realtime deployments. + realtime_client = AzureRealtimeWebRTC( + audio_track=AudioRecorderWebRTC(), + settings=settings, + ) # Create the settings for the session audio_player = AudioPlayerWebRTC() # the context manager calls the create_session method on the client and starts listening to the audio stream @@ -84,7 +89,7 @@ async def main() -> None: if __name__ == "__main__": print( - "Instructions: start speaking. " + "Instructions: start speaking when you see 'Session updated.' " "The model will detect when you stop and automatically start responding. " "Press ctrl + c to stop the program." ) diff --git a/python/samples/concepts/realtime/simple_realtime_chat_websocket.py b/python/samples/concepts/realtime/simple_realtime_chat_websocket.py index 412703f24e4f..0aa59acc1258 100644 --- a/python/samples/concepts/realtime/simple_realtime_chat_websocket.py +++ b/python/samples/concepts/realtime/simple_realtime_chat_websocket.py @@ -3,8 +3,6 @@ import asyncio import logging -from azure.identity import AzureCliCredential - from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices from semantic_kernel.connectors.ai.open_ai import ( AzureRealtimeExecutionSettings, @@ -59,7 +57,11 @@ async def main() -> None: # for more details. voice="shimmer", ) - realtime_client = AzureRealtimeWebsocket(settings=settings, credential=AzureCliCredential()) + # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28" + # for Azure OpenAI deployments realtime deployments. + realtime_client = AzureRealtimeWebsocket( + settings=settings, + ) audio_player = AudioPlayerWebsocket() audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client) # Create the settings for the session @@ -84,7 +86,7 @@ async def main() -> None: if __name__ == "__main__": print( - "Instructions: Start speaking. " + "Instructions: Start speaking when you see 'Session updated.' " "The model will detect when you stop and automatically start responding. " "Press ctrl + c to stop the program." ) diff --git a/python/samples/concepts/realtime/utils.py b/python/samples/concepts/realtime/utils.py index 9c8d4491c734..6943eaddcfb0 100644 --- a/python/samples/concepts/realtime/utils.py +++ b/python/samples/concepts/realtime/utils.py @@ -321,6 +321,7 @@ def _sounddevice_callback(self, outdata, frames, time, status): logger.debug(f"Audio output status: {status}") if self._queue: if self._queue.empty(): + outdata[:] = 0 return data = self._queue.get_nowait() outdata[:] = data.reshape(outdata.shape) diff --git a/python/semantic_kernel/connectors/ai/open_ai/const.py b/python/semantic_kernel/connectors/ai/open_ai/const.py index 0f39c75fd593..3c78fee15d2b 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/const.py +++ b/python/semantic_kernel/connectors/ai/open_ai/const.py @@ -2,4 +2,4 @@ from typing import Final -DEFAULT_AZURE_API_VERSION: Final[str] = "2024-10-21" +DEFAULT_AZURE_API_VERSION: Final[str] = "2025-08-28" diff --git a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py index 4d0647de10c2..3c45a9aac477 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py +++ b/python/semantic_kernel/connectors/ai/open_ai/prompt_execution_settings/open_ai_realtime_execution_settings.py @@ -54,7 +54,6 @@ class TurnDetection(KernelBaseModel): class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): """Request settings for OpenAI realtime services.""" - modalities: Sequence[Literal["audio", "text"]] | None = None output_modalities: Sequence[Literal["audio", "text"]] | None = None ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None instructions: str | None = None @@ -77,8 +76,7 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings): "on the function choice configuration.", ), ] = None - temperature: Annotated[float | None, Field(ge=0.6, le=1.2)] = None - max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None + max_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None def prepare_settings_dict(self, **kwargs) -> dict[str, Any]: diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py index 98653e103153..304b4e4efff1 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/_open_ai_realtime.py @@ -255,8 +255,8 @@ class ListenEvents(str, Enum): RESPONSE_OUTPUT_ITEM_DONE = "response.output_item.done" RESPONSE_CONTENT_PART_ADDED = "response.content_part.added" RESPONSE_CONTENT_PART_DONE = "response.content_part.done" - RESPONSE_TEXT_DELTA = "response.text.delta" - RESPONSE_TEXT_DONE = "response.text.done" + RESPONSE_TEXT_DELTA = "response.output_text.delta" + RESPONSE_TEXT_DONE = "response.output_text.done" RESPONSE_AUDIO_TRANSCRIPT_DELTA = "response.output_audio_transcript.delta" RESPONSE_AUDIO_TRANSCRIPT_DONE = "response.output_audio_transcript.done" RESPONSE_AUDIO_DELTA = "response.output_audio.delta" @@ -302,7 +302,12 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt might be of different types. """ match event.type: - case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value | "response.audio_transcript.delta": + case ( + ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA.value + | "response.audio_transcript.delta" + | ListenEvents.RESPONSE_TEXT_DELTA.value + | "response.text.delta" + ): yield RealtimeTextEvent( service_type=event.type, service_event=event, @@ -312,15 +317,16 @@ async def _parse_event(self, event: RealtimeServerEvent) -> AsyncGenerator[Realt choice_index=0, ), ) - case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE.value | "response.audio_transcript.done": - yield RealtimeTextEvent( - service_type=event.type, - service_event=event, - text=TextContent( - inner_content=event, - text=event.transcript, # type: ignore - ), - ) + case ( + ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DONE.value + | "response.audio_transcript.done" + | ListenEvents.RESPONSE_TEXT_DONE.value + | "response.text.done" + ): + # Don't yield RealtimeTextEvent here — the deltas already streamed all + # the text. Emitting the full text again would cause duplicate output + # for any consumer that prints every RealtimeTextEvent. + yield RealtimeEvent(service_type=event.type, service_event=event) case ListenEvents.RESPONSE_OUTPUT_ITEM_ADDED.value: if event.item.type == "function_call" and event.item.call_id and event.item.name: # type: ignore self._call_id_to_function_map[event.item.call_id] = event.item.name # type: ignore @@ -723,24 +729,19 @@ async def _send(self, event: RealtimeClientEvent) -> None: # Only keep fields that are allowed in session updates # Note: output_modalities is not allowed in WebRTC session updates allowed_fields = { + "type", "instructions", "model", "max_output_tokens", "tools", "tool_choice", - "temperature", "prompt", "tracing", "truncation", } event_dict["session"] = {k: v for k, v in session_dict.items() if k in allowed_fields} - # Debug: Log what we're sending to see the structure - import json - - json_data = json.dumps(event_dict) - logger.debug(f"Sending WebRTC session.update: {json_data}") - self.data_channel.send(json_data) + self.data_channel.send(json.dumps(event_dict)) else: self.data_channel.send(event.model_dump_json(exclude_none=True)) except Exception as e: @@ -860,8 +861,18 @@ async def _on_data(self, data: str) -> None: await self._receive_buffer.put(parsed_event) async def _get_ephemeral_token(self) -> str: - """Get an ephemeral token from OpenAI.""" - data = {"model": self.ai_model_id} + """Get an ephemeral token from OpenAI. + + GA endpoint: POST /v1/realtime/client_secrets + Request body: {"session": {"type": "realtime", "model": ""}} + Response: {"value": "", "expires_at": ..., "session": {...}} + """ + data = { + "session": { + "type": "realtime", + "model": self.ai_model_id, + } + } headers, url = self._get_ephemeral_token_headers_and_url() headers = prepend_semantic_kernel_to_user_agent(headers) try: @@ -874,22 +885,25 @@ async def _get_ephemeral_token(self) -> str: raise Exception(f"Failed to get ephemeral token: {error_text}") result = await response.json() - return result["client_secret"]["value"] + return result["value"] except Exception as e: logger.error(f"Failed to get ephemeral token: {e!s}") raise def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]: - """Get the headers for the ephemeral token.""" + """Get the headers and URL for the ephemeral token.""" return { "Authorization": f"Bearer {self.client.api_key}", "Content-Type": "application/json", - }, f"{self.client.realtime._client.base_url}/realtime/sessions" + }, f"{self.client.realtime._client.base_url}/realtime/client_secrets" def _get_webrtc_url(self) -> str: - """Get the WebRTC URL.""" - return f"{self.client.realtime._client.base_url}/realtime?model={self.ai_model_id}" + """Get the WebRTC URL. + + GA endpoint: POST /v1/realtime/calls?model= + """ + return f"{self.client.realtime._client.base_url}/realtime/calls?model={self.ai_model_id}" # region Websocket @@ -933,9 +947,6 @@ async def _send(self, event: RealtimeClientEvent) -> None: if not self.connection: raise ValueError("Connection is not established.") try: - # Debug logging to see what we're actually sending - if hasattr(event, "type") and event.type == "session.update": - logger.debug(f"Sending session.update event: {event.model_dump()}") await self.connection.send(event) except Exception as e: logger.error(f"Error sending response: {e!s}") diff --git a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py index fa45cf8eef97..9ae6f7558ec6 100644 --- a/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py +++ b/python/semantic_kernel/connectors/ai/open_ai/services/azure_realtime.py @@ -2,12 +2,15 @@ import logging import sys +import warnings from collections.abc import Callable, Coroutine, Mapping from typing import TYPE_CHECKING, Any +from aiohttp import ClientSession from azure.core.credentials import TokenCredential from openai import AsyncAzureOpenAI from openai.lib.azure import AsyncAzureADTokenProvider +from openai.resources.realtime.realtime import AsyncRealtimeConnection from pydantic import ValidationError from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import ( @@ -21,13 +24,18 @@ from semantic_kernel.connectors.ai.open_ai.services.open_ai_model_types import OpenAIModelTypes from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings +from semantic_kernel.const import USER_AGENT from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError from semantic_kernel.utils.feature_stage_decorator import experimental +from semantic_kernel.utils.telemetry.user_agent import SEMANTIC_KERNEL_USER_AGENT, prepend_semantic_kernel_to_user_agent if TYPE_CHECKING: from aiortc.mediastreams import MediaStreamTrack from numpy import ndarray + from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings + from semantic_kernel.contents.chat_history import ChatHistory + if sys.version_info >= (3, 12): from typing import override # pragma: no cover else: @@ -139,6 +147,50 @@ def __init__( def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]: return AzureRealtimeExecutionSettings + @override + async def create_session( + self, + chat_history: "ChatHistory | None" = None, + settings: "PromptExecutionSettings | None" = None, + **kwargs: Any, + ) -> None: + """Create a session in the service. + + The Azure GA Realtime endpoint (/openai/v1/realtime) does not accept + the api-version query parameter. The openai SDK always adds it, so we + bypass the SDK's _configure_realtime and build the connection directly. + """ + from websockets.asyncio.client import connect as ws_connect + + # Build the GA WebSocket URL: wss://.openai.azure.com/openai/v1/realtime?model= + # Note: GA uses ?model= (not ?deployment= which was preview) + # See: https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/realtime-audio-websockets + endpoint = str(self.client._base_url).rstrip("/") # type: ignore[attr-defined] + if "/openai" in endpoint: + endpoint = endpoint[: endpoint.index("/openai")] + url = f"wss://{endpoint.split('://')[-1]}/openai/v1/realtime?model={self.ai_model_id}" + + # Build auth headers + auth_headers: dict[str, str] = {} + if self.client.api_key and self.client.api_key != "": + auth_headers["api-key"] = self.client.api_key + else: + token = await self.client._get_azure_ad_token() # type: ignore[attr-defined] + if token: + auth_headers["Authorization"] = f"Bearer {token}" + + ws = await ws_connect( + url, + additional_headers={ + **auth_headers, + USER_AGENT: SEMANTIC_KERNEL_USER_AGENT, + }, + ) + + self.connection = AsyncRealtimeConnection(ws) + self.connected.set() + await self.update_session(settings=settings, chat_history=chat_history, **kwargs) + @experimental class AzureRealtimeWebRTC(OpenAIRealtimeWebRTCBase, AzureOpenAIConfigBase): @@ -147,7 +199,7 @@ class AzureRealtimeWebRTC(OpenAIRealtimeWebRTCBase, AzureOpenAIConfigBase): def __init__( self, audio_track: "MediaStreamTrack", - region: str, + region: str | None = None, audio_output_callback: Callable[["ndarray"], Coroutine[Any, Any, None]] | None = None, service_id: str | None = None, api_key: str | None = None, @@ -165,14 +217,13 @@ def __init__( credential: TokenCredential | None = None, **kwargs: Any, ) -> None: - """Initialize an AzureRealtimeWebsocket service. + """Initialize an AzureRealtimeWebRTC service. Args: audio_track: The audio track to use for the service, only used by WebRTC. It can be any class that implements the AudioStreamTrack interface. - region: The region to use for the service. - This is required for WebRTC, and should be the same as the region of the Azure deployment. - Currently this can be "eastus2" or "swedencentral". + region: Deprecated. No longer needed for GA Realtime API. + Previously required for the preview WebRTC endpoint. audio_output_callback: The audio output callback, optional. This should be a coroutine, that takes a ndarray with audio as input. The goal of this function is to allow you to play the audio with the @@ -224,9 +275,15 @@ def __init__( raise ServiceInitializationError("Failed to create OpenAI settings.", ex) from ex if not azure_openai_settings.realtime_deployment_name: raise ServiceInitializationError("The OpenAI realtime model ID is required.") + if region is not None: + warnings.warn( + "The 'region' parameter is deprecated and no longer needed for the GA Realtime API. " + "The WebRTC endpoint is now derived from the resource endpoint.", + DeprecationWarning, + stacklevel=2, + ) if audio_track: kwargs["audio_track"] = audio_track - kwargs["region"] = region super().__init__( api_key=azure_openai_settings.api_key.get_secret_value() if azure_openai_settings.api_key else None, audio_output_callback=audio_output_callback, @@ -251,11 +308,27 @@ def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]: @override def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]: - """Get the headers and URL for the ephemeral token.""" - url = ( - f"{self.client.realtime._client.base_url}/realtimeapi/sessions?api-version=" - f"{self.client._api_version}" # type: ignore[attr-defined] - ) + """Get the headers and URL for the ephemeral token. + + Uses the GA endpoint format: POST /openai/v1/realtime/client_secrets + See: https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/realtime-audio-webrtc + """ + endpoint = str(self.client._base_url).rstrip("/") # type: ignore[attr-defined] + # Strip any trailing path segments to get the base Azure resource URL + # base_url typically looks like https://.openai.azure.com/openai/... + # We need: https://.openai.azure.com/openai/v1/realtime/client_secrets + if "/openai" in endpoint: + endpoint = endpoint[: endpoint.index("/openai")] + url = f"{endpoint}/openai/v1/realtime/client_secrets" + + if self.client.api_key and self.client.api_key != "": + return ( + { + "api-key": self.client.api_key, + "Content-Type": "application/json", + }, + url, + ) if self.client._azure_ad_token is not None: # type: ignore[attr-defined] return ( { @@ -264,20 +337,51 @@ def _get_ephemeral_token_headers_and_url(self) -> tuple[dict[str, str], str]: }, url, ) - return ( - { - "Authorization": f"Bearer {self.client.api_key}", - "Content-Type": "application/json", - }, - url, - ) + raise ServiceInitializationError("No API key or Azure AD token available for ephemeral token request.") + + @override + async def _get_ephemeral_token(self) -> str: + """Get an ephemeral token from Azure OpenAI. + + Azure GA requires a nested session object: + {"session": {"type": "realtime", "model": ""}} + And returns the token directly as {"value": "..."} rather than + OpenAI's {"client_secret": {"value": "..."}}. + See: https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/realtime-audio-webrtc + """ + data = { + "session": { + "type": "realtime", + "model": self.ai_model_id, + } + } + headers, url = self._get_ephemeral_token_headers_and_url() + headers = prepend_semantic_kernel_to_user_agent(headers) + try: + async with ( + ClientSession() as session, + session.post(url, headers=headers, json=data) as response, + ): + if response.status not in [200, 201]: + error_text = await response.text() + raise Exception(f"Failed to get ephemeral token: {error_text}") + + result = await response.json() + # Azure GA format returns {"value": "token"} directly + return result["value"] + + except Exception as e: + logger.error(f"Failed to get ephemeral token: {e!s}") + raise @override def _get_webrtc_url(self) -> str: - """Get the webrtc URL.""" - if not self.model_extra: - raise ServiceInitializationError("The region is required for WebRTC.") - region = self.model_extra.get("region") - if not region: - raise ServiceInitializationError("The region is required for WebRTC.") - return f"https://{region}.realtimeapi-preview.ai.azure.com/v1/realtimertc" + """Get the WebRTC URL. + + Uses the GA endpoint format: /openai/v1/realtime/calls + See: https://learn.microsoft.com/en-us/azure/ai-foundry/openai/how-to/realtime-audio-webrtc + """ + endpoint = str(self.client._base_url).rstrip("/") # type: ignore[attr-defined] + if "/openai" in endpoint: + endpoint = endpoint[: endpoint.index("/openai")] + return f"{endpoint}/openai/v1/realtime/calls" diff --git a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py index 3a7ad49c0732..64d7a6a64b58 100644 --- a/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py +++ b/python/tests/unit/connectors/ai/open_ai/services/test_openai_realtime.py @@ -120,6 +120,7 @@ arguments="{}", call_id="call_id", item_id="item_id", + name="function_name", output_index=0, response_id="response_id", ), @@ -383,6 +384,7 @@ def test_create_openai_realtime_event( event_id="event_id", output_index=0, item_id="item_id", + name="function_name", response_id="response_id", type="response.function_call_arguments.done", ), @@ -494,6 +496,7 @@ async def test_parse_function_call_arguments_done(OpenAIWebsocket, kernel): event_id="event_id", output_index=0, item_id="item_id", + name="plugin_name-function_name", response_id="response_id", type="response.function_call_arguments.done", ) @@ -533,6 +536,7 @@ async def test_parse_function_call_arguments_done_fail(OpenAIWebsocket, kernel): event_id="event_id", output_index=0, item_id="item_id", + name="function_name", response_id="response_id", type="response.function_call_arguments.done", )