Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ dependencies = [
"numpy >= 1.25.0; python_version < '3.12'",
"numpy >= 1.26.0; python_version >= '3.12'",
# openai connector
"openai >= 1.98.0,<2",
"openai >= 2.0.0",
# openapi and swagger
"openapi_core >= 0.18,<0.20",
"websockets >= 13, < 16",
Expand Down
2 changes: 1 addition & 1 deletion python/samples/concepts/realtime/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ These samples are more complex than most because of the nature of these APIs. T
To run these samples, you will need to have the following setup:

- Environment variables for OpenAI (websocket or WebRTC), with your key and OPENAI_REALTIME_MODEL_ID set.
- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2024-10-01-preview`.
- Environment variables for Azure (websocket only), set with your endpoint, optionally a key and AZURE_OPENAI_REALTIME_DEPLOYMENT_NAME set. The API version needs to be at least `2025-08-28`.
- To run the sample with a simple version of a class that handles the incoming and outgoing sound you need to install the following packages in your environment:
- semantic-kernel[realtime]
- pyaudio
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
from datetime import datetime
from random import randint

from azure.identity import AzureCliCredential

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
Expand Down Expand Up @@ -81,8 +79,12 @@ async def main() -> None:
# and can also be passed in the receive method
# You can also pass in kernel, plugins, chat_history or settings here.
# For WebRTC the audio_track is required

# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
# for Azure OpenAI realtime deployments.
realtime_agent = AzureRealtimeWebRTC(
audio_track=AudioRecorderWebRTC(), region="swedencentral", plugins=[Helpers()], credential=AzureCliCredential()
audio_track=AudioRecorderWebRTC(),
plugins=[Helpers()],
)

# Create the settings for the session
Expand All @@ -103,6 +105,7 @@ async def main() -> None:
flowery prose.
""",
voice="alloy",
output_modalities=["text", "audio"],
turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
function_choice_behavior=FunctionChoiceBehavior.Auto(),
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ async def main() -> None:
# to signal the end of the user's turn and start the response.
# manual VAD is not part of this sample
# for more info: https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-turn_detection

# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
# for Azure OpenAI realtime deployments.
settings = AzureRealtimeExecutionSettings(
instructions="""
You are a chat bot. Your name is Mosscap and
Expand Down
43 changes: 28 additions & 15 deletions python/samples/concepts/realtime/simple_realtime_chat_webrtc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@

from samples.concepts.realtime.utils import AudioPlayerWebRTC, AudioRecorderWebRTC, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtimeExecutionSettings,
ListenEvents,
OpenAIRealtimeExecutionSettings,
OpenAIRealtimeWebRTC,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_realtime import AzureRealtimeWebRTC
from semantic_kernel.contents import RealtimeTextEvent

logging.basicConfig(level=logging.WARNING)
utils_log = logging.getLogger("samples.concepts.realtime.utils")
Expand Down Expand Up @@ -42,7 +43,7 @@ async def main() -> None:
# create the realtime client; adding the audio output function is optional
# you can define the protocol to use, either "websocket" or "webrtc"
# they will behave the same way, even though the underlying protocol is quite different
settings = OpenAIRealtimeExecutionSettings(
settings = AzureRealtimeExecutionSettings(
instructions="""
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Expand All @@ -55,28 +56,40 @@ async def main() -> None:
# see https://platform.openai.com/docs/api-reference/realtime-sessions/create#realtime-sessions-create-voice
# for more details.
voice="alloy",
# Enable both text and audio output to get transcripts
output_modalities=["text", "audio"],
)
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
# for Azure OpenAI realtime deployments.
realtime_client = AzureRealtimeWebRTC(
audio_track=AudioRecorderWebRTC(),
settings=settings,
)
realtime_client = OpenAIRealtimeWebRTC(audio_track=AudioRecorderWebRTC(), settings=settings)
# Create the settings for the session
audio_player = AudioPlayerWebRTC()
# the context manager calls the create_session method on the client and starts listening to the audio stream
async with audio_player, realtime_client:
async for event in realtime_client.receive(audio_output_callback=audio_player.client_callback):
match event.event_type:
case "text":
# the model returns both audio and transcript of the audio, which we will print
print(event.text.text, end="")
case "service":
# OpenAI Specific events
if event.service_type == ListenEvents.SESSION_UPDATED:
print("Session updated")
if event.service_type == ListenEvents.RESPONSE_CREATED:
print("\nMosscap (transcript): ", end="")
match event:
case RealtimeTextEvent():
# Only process delta events for streaming, skip done events to avoid duplication
if event.service_type and "delta" in event.service_type and event.text.text:
print(event.text.text, end="", flush=True)
# Add newline when transcript is complete (done event)
elif event.service_type and "done" in event.service_type:
print() # Add newline for readability
case _:
# Handle service events
if event.event_type == "service" and event.service_type:
if event.service_type == ListenEvents.SESSION_UPDATED:
print("Session updated")
elif event.service_type == ListenEvents.RESPONSE_CREATED:
print("\nMosscap (transcript): ", end="")


if __name__ == "__main__":
print(
"Instructions: start speaking. "
"Instructions: start speaking when you see 'Session updated.' "
"The model will detect when you stop and automatically start responding. "
"Press ctrl + c to stop the program."
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import asyncio
import logging

from azure.identity import AzureCliCredential

from samples.concepts.realtime.utils import AudioPlayerWebsocket, AudioRecorderWebsocket, check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
AzureRealtimeExecutionSettings,
Expand Down Expand Up @@ -59,7 +57,11 @@ async def main() -> None:
# for more details.
voice="shimmer",
)
realtime_client = AzureRealtimeWebsocket(settings=settings, credential=AzureCliCredential())
# Note: api_version (either through settings or directly in the client) must be set to "2025-08-28"
# for Azure OpenAI realtime deployments.
realtime_client = AzureRealtimeWebsocket(
settings=settings,
)
audio_player = AudioPlayerWebsocket()
audio_recorder = AudioRecorderWebsocket(realtime_client=realtime_client)
# Create the settings for the session
Expand All @@ -84,7 +86,7 @@ async def main() -> None:

if __name__ == "__main__":
print(
"Instructions: Start speaking. "
"Instructions: Start speaking when you see 'Session updated.' "
"The model will detect when you stop and automatically start responding. "
"Press ctrl + c to stop the program."
)
Expand Down
1 change: 1 addition & 0 deletions python/samples/concepts/realtime/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ def _sounddevice_callback(self, outdata, frames, time, status):
logger.debug(f"Audio output status: {status}")
if self._queue:
if self._queue.empty():
outdata[:] = 0
return
data = self._queue.get_nowait()
outdata[:] = data.reshape(outdata.shape)
Expand Down
2 changes: 1 addition & 1 deletion python/semantic_kernel/connectors/ai/open_ai/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

from typing import Final

DEFAULT_AZURE_API_VERSION: Final[str] = "2024-10-21"
DEFAULT_AZURE_API_VERSION: Final[str] = "2025-08-28"
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ class TurnDetection(KernelBaseModel):
class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
"""Request settings for OpenAI realtime services."""

modalities: Sequence[Literal["audio", "text"]] | None = None
output_modalities: Sequence[Literal["audio", "text"]] | None = None
ai_model_id: Annotated[str | None, Field(None, serialization_alias="model")] = None
instructions: str | None = None
voice: str | None = None
Expand All @@ -76,10 +76,52 @@ class OpenAIRealtimeExecutionSettings(PromptExecutionSettings):
"on the function choice configuration.",
),
] = None
temperature: Annotated[float | None, Field(ge=0.6, le=1.2)] = None
max_response_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
max_output_tokens: Annotated[int | Literal["inf"] | None, Field(gt=0)] = None
input_audio_noise_reduction: dict[Literal["type"], Literal["near_field", "far_field"]] | None = None

def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
    """Return the settings as a dictionary ready to send to the AI service.

    The parent class produces a flat dictionary. The audio-related entries
    (voice, turn detection, audio formats, transcription and noise reduction)
    are removed from the top level and regrouped under a nested ``audio``
    entry with ``input`` and ``output`` sections. The ``audio`` entry is only
    added when at least one of those settings is present.
    """
    # (flat setting name, audio section, key inside that section).
    # The sequence is kept in relocation order so the nested dictionaries
    # end up with a stable, predictable key ordering.
    relocations = (
        ("voice", "output", "voice"),
        ("turn_detection", "input", "turn_detection"),
        ("input_audio_format", "input", "format"),
        ("output_audio_format", "output", "format"),
        ("input_audio_transcription", "input", "transcription"),
        ("input_audio_noise_reduction", "input", "noise_reduction"),
    )

    # Start from whatever the parent class prepares.
    prepared = super().prepare_settings_dict(**kwargs)

    audio: dict[str, Any] = {}
    for flat_name, section, nested_name in relocations:
        if flat_name not in prepared:
            continue
        # Move the value: drop it from the top level, nest it under audio.<section>.
        audio.setdefault(section, {})[nested_name] = prepared.pop(flat_name)

    # Leave the dictionary untouched when no audio setting was supplied.
    if audio:
        prepared["audio"] = audio

    return prepared


class AzureRealtimeExecutionSettings(OpenAIRealtimeExecutionSettings):
"""Request settings for Azure OpenAI realtime services."""
Expand Down
Loading
Loading