microsoft · moonbox3 · Feb 28, 2026 · Oct 24, 2025 · Oct 24, 2025 · Oct 24, 2025
@@ -37,7 +37,7 @@ dependencies = [
     "numpy >= 1.25.0; python_version < '3.12'",
     "numpy >= 1.26.0; python_version >= '3.12'",
     # openai connector
-    "openai >= 1.98.0,<2",
+    "openai >= 2.0.0",
     # openapi and swagger
     "openapi_core >= 0.18,<0.20",
     "websockets >= 13, < 16",

@@ -5,7 +5,7 @@ These samples are more complex than most because of the nature of these APIs. T
 To run these samples, you will need to have the following setup:
 
 - Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set.
-- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`.
+- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2025-08-28`.
 - To run the sample with a simple version of a class that handles the incoming and outgoing sound you need to install the following packages in your environment:
   - semantic-kernel[realtime]
   - pyaudio

@@ -5,8 +5,6 @@
 from datetime import datetime
 from random import randint
 
-from azure.identity import AzureCliCredential
-
 from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
 from semantic_kernel.connectors.ai import FunctionChoiceBehavior
 from semantic_kernel.connectors.ai.open_ai import (
@@ -81,8 +79,12 @@ async def main() -> None:
     # and can also be passed in the receive method
     # You can also pass in kernel, plugins, chat_history or settings here.
     # For WebRTC the audio_track is required
+
+    # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
+    # for Azure OpenAI realtime deployments.
     realtime_agent = AzureRealtimeWebRTC(
-        audio_track=AudioRecorderWebRTC(), region="swedencentral", plugins=[Helpers()], credential=AzureCliCredential()
+        audio_track=AudioRecorderWebRTC(),
+        plugins=[Helpers()],
     )
 
     # Create the settings for the session
@@ -103,6 +105,7 @@ async def main() -> None:
     flowery prose.
     """,
         voice="alloy",
+        output_modalities=["text", "audio"],
         turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
         function_choice_behavior=FunctionChoiceBehavior.Auto(),
     )

@@ -82,6 +82,9 @@ async def main() -> None:
     # to signal the end of the user's turn and start the response.
     # manual VAD is not part of this sample
     # for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection
+
+    # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
+    # for Azure OpenAI realtime deployments.
     settings = AzureRealtimeExecutionSettings(
         instructions="""
     You are a chat bot. Your name is Mosscap and

@@ -5,10 +5,11 @@
 
 from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
 from semantic_kernel.connectors.ai.open_ai import (
+    AzureRealtimeExecutionSettings,
     ListenEvents,
-    OpenAIRealtimeExecutionSettings,
-    OpenAIRealtimeWebRTC,
 )
+from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebRTC
+from semantic_kernel.contents import RealtimeTextEvent
 
 logging.basicConfig(level=logging.WARNING)
 utils_log = logging.getLogger("samples.concepts.realtime.utils")
@@ -42,7 +43,7 @@ async def main() -> None:
     # create the realtime client and optionally add the audio output function, this is optional
     # you can define the protocol to use, either "websocket" or "webrtc"
     # they will behave the same way, even though the underlying protocol is quite different
-    settings = OpenAIRealtimeExecutionSettings(
+    settings = AzureRealtimeExecutionSettings(
         instructions="""
     You are a chat bot. Your name is Mosscap and
     you have one goal: figure out what people need.
@@ -55,28 +56,40 @@ async def main() -> None:
         # see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
         # for more details.
         voice="alloy",
+        # Enable both text and audio output to get transcripts
+        output_modalities=["text", "audio"],
+    )
+    # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
+    # for Azure OpenAI realtime deployments.
+    realtime_client = AzureRealtimeWebRTC(
+        audio_track=AudioRecorderWebRTC(),
+        settings=settings,
     )
-    realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
     # Create the settings for the session
     audio_player = AudioPlayerWebRTC()
     # the context manager calls the create_session method on the client and starts listening to the audio stream
     async with audio_player, realtime_client:
         async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
-            match event.event_type:
-                case "text":
-                    # the model returns both audio and transcript of the audio, which we will print
-                    print(event.text.text, end="")
-                case "service":
-                    # OpenAI Specific events
-                    if event.service_type == ListenEvents.SESSION_UPDATED:
-                        print("Session updated")
-                    if event.service_type == ListenEvents.RESPONSE_CREATED:
-                        print("\nMosscap (transcript): ", end="")
+            match event:
+                case RealtimeTextEvent():
+                    # Only process delta events for streaming, skip done events to avoid duplication
+                    if event.service_type and "delta" in event.service_type and event.text.text:
+                        print(event.text.text, end="", flush=True)
+                    # Add newline when transcript is complete (done event)
+                    elif event.service_type and "done" in event.service_type:
+                        print()  # Add newline for readability
+                case _:
+                    # Handle service events
+                    if event.event_type == "service" and event.service_type:
+                        if event.service_type == ListenEvents.SESSION_UPDATED:
+                            print("Session updated")
+                        elif event.service_type == ListenEvents.RESPONSE_CREATED:
+                            print("\nMosscap (transcript): ", end="")
 
 
 if __name__ == "__main__":
     print(
-        "Instructions: start speaking. "
+        "Instructions: start speaking when you see 'Session updated.' "
         "The model will detect when you stop and automatically start responding. "
         "Press ctrl + c to stop the program."
     )

@@ -3,8 +3,6 @@
 import asyncio
 import logging
 
-from azure.identity import AzureCliCredential
-
 from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
 from semantic_kernel.connectors.ai.open_ai import (
     AzureRealtimeExecutionSettings,
@@ -59,7 +57,11 @@ async def main() -> None:
         # for more details.
         voice="shimmer",
     )
-    realtime_client = AzureRealtimeWebsocket(settings=settings, credential=AzureCliCredential())
+    # Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
+    # for Azure OpenAI realtime deployments.
+    realtime_client = AzureRealtimeWebsocket(
+        settings=settings,
+    )
     audio_player = AudioPlayerWebsocket()
     audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
     # Create the settings for the session
@@ -84,7 +86,7 @@ async def main() -> None:
 
 if __name__ == "__main__":
     print(
-        "Instructions: Start speaking. "
+        "Instructions: Start speaking when you see 'Session updated.' "
         "The model will detect when you stop and automatically start responding. "
         "Press ctrl + c to stop the program."
     )

@@ -321,6 +321,7 @@ def _sounddevice_callback(self, outdata, frames, time, status):
             logger.debug(f"Audio output status: {status}")
         if self._queue:
             if self._queue.empty():
+                outdata[:] = 0
                 return
             data = self._queue.get_nowait()
             outdata[:] = data.reshape(outdata.shape)

@@ -2,4 +2,4 @@
 
 from typing import Final
 
-DEFAULT_AZURE_API_VERSION: Final[str] = "2024-10-21"
+DEFAULT_AZURE_API_VERSION: Final[str] = "2025-08-28"
@@ -54,7 +54,7 @@ class TurnDetection(KernelBaseModel):
 class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
     """Request settings for OpenAI realtime services."""
 
-    modalities: Sequence[Literal["audio", "text"]] | None = None
+    output_modalities: Sequence[Literal["audio", "text"]] | None = None
     ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None
     instructions: str | None = None
     voice: str | None = None
@@ -76,10 +76,52 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
             "on the function choice configuration.",
         ),
     ] = None
-    temperature: Annotated[float | None, Field(ge=0.6, le=1.2)] = None
-    max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
+    max_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
     input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None
 
+    def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
+        """Prepare the settings as a dictionary for sending to the AI service.
+
+        For realtime settings, we need to properly structure the audio configuration
+        to match the OpenAI API expectations where voice and turn_detection are nested
+        under the audio field.
+        """
+        # Get the base settings dict (excludes service_id, extension_data, etc.)
+        settings_dict = super().prepare_settings_dict(**kwargs)
+
+        # Build the audio configuration object
+        audio_config: dict[str, Any] = {}
+
+        # Handle voice (goes in audio.output.voice)
+        if "voice" in settings_dict:
+            audio_config.setdefault("output", {})["voice"] = settings_dict.pop("voice")
+
+        # Handle turn_detection (goes in audio.input.turn_detection)
+        if "turn_detection" in settings_dict:
+            audio_config.setdefault("input", {})["turn_detection"] = settings_dict.pop("turn_detection")
+
+        # Handle input audio format
+        if "input_audio_format" in settings_dict:
+            audio_config.setdefault("input", {})["format"] = settings_dict.pop("input_audio_format")
+
+        # Handle output audio format
+        if "output_audio_format" in settings_dict:
+            audio_config.setdefault("output", {})["format"] = settings_dict.pop("output_audio_format")
+
+        # Handle input audio transcription
+        if "input_audio_transcription" in settings_dict:
+            audio_config.setdefault("input", {})["transcription"] = settings_dict.pop("input_audio_transcription")
+
+        # Handle input audio noise reduction
+        if "input_audio_noise_reduction" in settings_dict:
+            audio_config.setdefault("input", {})["noise_reduction"] = settings_dict.pop("input_audio_noise_reduction")
+
+        # Add the audio config if it has any content
+        if audio_config:
+            settings_dict["audio"] = audio_config
+
+        return settings_dict
+
 
 class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings):
     """Request settings for Azure OpenAI realtime services."""
Original file line number	Diff line number	Diff line change
Expand Up		@@ -2,4 +2,4 @@

		from typing import Final

		DEFAULT_AZURE_API_VERSION: Final[str] = "2024-10-21"
		DEFAULT_AZURE_API_VERSION: Final[str] = "2025-08-28"