Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/realtime/twilio/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ This example demonstrates how to connect the OpenAI Realtime API to a phone call

- **WebSocket connection issues**: Ensure your ngrok URL is correct and publicly accessible
- **Audio quality**: Twilio streams audio in mulaw format at 8kHz, which may affect quality
- **Audio jittering/skipping**: The implementation includes audio buffering (50ms chunks) to reduce jittering at word boundaries. This buffers both incoming (Twilio → OpenAI) and outgoing (OpenAI → Twilio) audio for smoother playback.
- **Latency**: Network latency between Twilio, your server, and OpenAI affects response time
- **Logs**: Check the console output for detailed connection and error logs

Expand Down
90 changes: 67 additions & 23 deletions examples/realtime/twilio/twilio_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,15 @@ def __init__(self, twilio_websocket: WebSocket):
self.BUFFER_SIZE_BYTES = int(self.SAMPLE_RATE * self.CHUNK_LENGTH_S) # 50ms worth of audio

self._stream_sid: str | None = None

# Incoming audio buffer (from Twilio to OpenAI)
self._audio_buffer: bytearray = bytearray()
self._last_buffer_send_time = time.time()

# Outgoing audio buffer (from OpenAI to Twilio) - NEW
Copy link

Copilot AI Oct 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the '- NEW' suffix from the comment as it's temporary documentation that shouldn't remain in production code.

Suggested change
# Outgoing audio buffer (from OpenAI to Twilio) - NEW
# Outgoing audio buffer (from OpenAI to Twilio)

Copilot uses AI. Check for mistakes.

self._outgoing_audio_buffer: bytearray = bytearray()
self._last_outgoing_send_time = time.time()

# Mark event tracking for playback
self._mark_counter = 0
self._mark_data: dict[
Expand Down Expand Up @@ -122,18 +128,10 @@ async def _twilio_message_loop(self) -> None:
async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
"""Handle events from the realtime session."""
if event.type == "audio":
base64_audio = base64.b64encode(event.audio.data).decode("utf-8")
await self.twilio_websocket.send_text(
json.dumps(
{
"event": "media",
"streamSid": self._stream_sid,
"media": {"payload": base64_audio},
}
)
)
# Buffer outgoing audio to reduce jittering
self._outgoing_audio_buffer.extend(event.audio.data)

# Send mark event for playback tracking
# Store metadata for this audio chunk
Comment on lines 133 to 136
Copy link

Copilot AI Oct 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The audio buffering logic and metadata storage are tightly coupled. Consider extracting the mark counter logic into a separate method to improve separation of concerns and make the code more maintainable.

Copilot uses AI. Check for mistakes.

self._mark_counter += 1
mark_id = str(self._mark_counter)
self._mark_data[mark_id] = (
Expand All @@ -142,23 +140,24 @@ async def _handle_realtime_event(self, event: RealtimeSessionEvent) -> None:
len(event.audio.data),
)

await self.twilio_websocket.send_text(
json.dumps(
{
"event": "mark",
"streamSid": self._stream_sid,
"mark": {"name": mark_id},
}
)
)
# Send buffered audio if we have enough data (reduces jittering)
if len(self._outgoing_audio_buffer) >= self.BUFFER_SIZE_BYTES:
await self._flush_outgoing_audio_buffer(mark_id)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Flush combines audio but drops mark metadata

Outgoing audio chunks now accumulate in _outgoing_audio_buffer, but _handle_realtime_event still allocates a new mark entry for every chunk and only passes the mark id of the most recent chunk to _flush_outgoing_audio_buffer. When the buffer contains multiple chunks, Twilio receives a single mark message that represents only the last chunk’s byte count while the earlier marks stay in _mark_data forever and are never acknowledged. This causes playback tracking to under-report most of the audio that was actually sent and leaks entries in _mark_data over long calls. Consider aggregating the byte count for all buffered chunks into one mark or clearing the unused mark metadata when the combined buffer is flushed.

Useful? React with 👍 / 👎.


elif event.type == "audio_interrupted":
print("Sending audio interrupted to Twilio")
# Flush any remaining buffered audio before clearing
if self._outgoing_audio_buffer:
await self._flush_outgoing_audio_buffer(None)
Copy link

Copilot AI Oct 18, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The pattern of checking buffer existence before flushing is repeated multiple times. Consider having _flush_outgoing_audio_buffer handle the empty buffer check internally to reduce code duplication.

Copilot uses AI. Check for mistakes.

await self.twilio_websocket.send_text(
json.dumps({"event": "clear", "streamSid": self._stream_sid})
)
self._outgoing_audio_buffer.clear()
elif event.type == "audio_end":
print("Audio end")
print("Audio end - flushing remaining buffered audio")
# Flush remaining audio at the end
if self._outgoing_audio_buffer:
await self._flush_outgoing_audio_buffer(None)
elif event.type == "raw_model_event":
pass
else:
Expand Down Expand Up @@ -246,19 +245,64 @@ async def _flush_audio_buffer(self) -> None:
except Exception as e:
print(f"Error sending buffered audio to OpenAI: {e}")

async def _flush_outgoing_audio_buffer(self, mark_id: str | None) -> None:
"""Send buffered audio to Twilio to reduce jittering."""
if not self._outgoing_audio_buffer:
return

try:
# Encode and send the buffered audio to Twilio
base64_audio = base64.b64encode(bytes(self._outgoing_audio_buffer)).decode("utf-8")
await self.twilio_websocket.send_text(
json.dumps(
{
"event": "media",
"streamSid": self._stream_sid,
"media": {"payload": base64_audio},
}
)
)

# Send mark event for playback tracking (if provided)
if mark_id is not None:
await self.twilio_websocket.send_text(
json.dumps(
{
"event": "mark",
"streamSid": self._stream_sid,
"mark": {"name": mark_id},
}
)
)

# Clear the buffer
self._outgoing_audio_buffer.clear()
self._last_outgoing_send_time = time.time()

except Exception as e:
print(f"Error sending buffered audio to Twilio: {e}")

async def _buffer_flush_loop(self) -> None:
    """Background task: flush either audio buffer once it goes stale.

    Wakes every CHUNK_LENGTH_S seconds. A buffer that has been holding
    data for more than two chunk lengths since its last send is flushed,
    so audio never sits indefinitely waiting to reach the size threshold.
    Covers both directions: Twilio -> OpenAI and OpenAI -> Twilio.
    """
    try:
        while True:
            await asyncio.sleep(self.CHUNK_LENGTH_S)  # wake roughly every 50ms
            now = time.time()
            stale_after = self.CHUNK_LENGTH_S * 2

            # Incoming direction (Twilio -> OpenAI): flush if data is stale.
            if self._audio_buffer and now - self._last_buffer_send_time > stale_after:
                await self._flush_audio_buffer()

            # Outgoing direction (OpenAI -> Twilio): flush without a mark.
            if (
                self._outgoing_audio_buffer
                and now - self._last_outgoing_send_time > stale_after
            ):
                await self._flush_outgoing_audio_buffer(None)

    except Exception as e:
        # Best-effort: log and let the task end rather than crash the call.
        print(f"Error in buffer flush loop: {e}")