From 8847346c6d9307edb6d87a9002bc8fd4a930d77e Mon Sep 17 00:00:00 2001
From: Lucy Gavin
Date: Tue, 21 Oct 2025 12:06:07 +0100
Subject: [PATCH 1/3] add tts

---
 examples/tts/realtime_parsing_example.py      | 307 ++++++++++++
 examples/tts/streaming_example.py             | 243 ++++++++++
 sdk/TTS/MIGRATION.md                          |  64 +++
 sdk/TTS/README.md                             | 274 +++++++++++
 sdk/TTS/pyproject.toml                        |  52 +++
 sdk/TTS/speechmatics/__init__.py              |   0
 sdk/TTS/speechmatics/tts/__init__.py          |  59 +++
 sdk/TTS/speechmatics/tts/_async_client.py     | 249 ++++++++++
 sdk/TTS/speechmatics/tts/_auth.py             | 162 +++++++
 sdk/TTS/speechmatics/tts/_exceptions.py       |  40 ++
 sdk/TTS/speechmatics/tts/_helpers.py          |  57 +++
 sdk/TTS/speechmatics/tts/_logging.py          |  49 ++
 sdk/TTS/speechmatics/tts/_models.py           |  31 ++
 .../tts/_realtime_streaming_client.py         | 440 ++++++++++++++++++
 sdk/TTS/speechmatics/tts/_streaming_client.py | 397 ++++++++++++++++
 sdk/TTS/speechmatics/tts/_transport.py        | 306 ++++++++++++
 16 files changed, 2730 insertions(+)
 create mode 100644 examples/tts/realtime_parsing_example.py
 create mode 100644 examples/tts/streaming_example.py
 create mode 100644 sdk/TTS/MIGRATION.md
 create mode 100644 sdk/TTS/README.md
 create mode 100644 sdk/TTS/pyproject.toml
 create mode 100644 sdk/TTS/speechmatics/__init__.py
 create mode 100644 sdk/TTS/speechmatics/tts/__init__.py
 create mode 100644 sdk/TTS/speechmatics/tts/_async_client.py
 create mode 100644 sdk/TTS/speechmatics/tts/_auth.py
 create mode 100644 sdk/TTS/speechmatics/tts/_exceptions.py
 create mode 100644 sdk/TTS/speechmatics/tts/_helpers.py
 create mode 100644 sdk/TTS/speechmatics/tts/_logging.py
 create mode 100644 sdk/TTS/speechmatics/tts/_models.py
 create mode 100644 sdk/TTS/speechmatics/tts/_realtime_streaming_client.py
 create mode 100644 sdk/TTS/speechmatics/tts/_streaming_client.py
 create mode 100644 sdk/TTS/speechmatics/tts/_transport.py

diff --git a/examples/tts/realtime_parsing_example.py b/examples/tts/realtime_parsing_example.py
new file mode 100644
index 0000000..644aff9
--- /dev/null
+++ b/examples/tts/realtime_parsing_example.py
@@ -0,0 +1,307 @@
+"""
+Real-time streaming TTS with live text parsing examples.
+
+This demonstrates true real-time TTS where audio generation happens
+simultaneously with text parsing - no waiting for complete text processing!
+"""
+
+import asyncio
+import time
+from pathlib import Path
+from typing import AsyncIterator
+
+# Assuming the real-time streaming client is available
+from speechmatics.tts import RealTimeStreamingTTSClient
+
+
+async def live_file_parsing_example():
+    """Demonstrate real-time file parsing with immediate audio synthesis."""
+    print("šŸ“– Real-time File Parsing Example")
+    print("Audio starts playing while file is still being read!")
+
+    # Create a sample file
+    sample_text = """
+    Welcome to real-time streaming text-to-speech synthesis.
+
+    This is paragraph one. As you can hear, the audio begins immediately,
+    even though the system is still reading and parsing the rest of this document.
+
+    This is paragraph two. The synthesis happens in real-time as each sentence
+    is parsed from the file. There's no waiting for the entire document to load.
+
+    This is paragraph three. The system intelligently finds sentence boundaries
+    and starts synthesis as soon as it has enough text to work with.
+
+    This is the final paragraph. By the time you hear this, the system has
+    been continuously reading, parsing, and synthesizing throughout the entire process.
+ """ + + sample_file = Path("realtime_sample.txt") + sample_file.write_text(sample_text) + + try: + async with RealTimeStreamingTTSClient( + api_key="your-api-key", + synthesis_threshold=50, # Start synthesis after 50 characters + max_parse_delay=0.05, # Very responsive parsing + ) as client: + + print("šŸŽµ Starting real-time parsing and synthesis...") + start_time = time.time() + chunk_count = 0 + + async for audio_chunk in client.stream_live_from_file( + sample_file, + voice="en-US-neural-1", + read_delay=0.03, # Simulate fast reading + ): + chunk_count += 1 + elapsed = time.time() - start_time + + print(f"šŸ”Š Audio chunk {chunk_count} at {elapsed:.2f}s: {len(audio_chunk)} bytes") + + # In real application: await play_audio_immediately(audio_chunk) + # Simulate audio playback time + await asyncio.sleep(0.1) + + total_time = time.time() - start_time + print(f"āœ… Real-time processing complete in {total_time:.2f}s with {chunk_count} chunks") + + finally: + if sample_file.exists(): + sample_file.unlink() + + +async def live_typing_simulation(): + """Simulate user typing with immediate TTS synthesis.""" + print("\nāŒØļø Live Typing Simulation") + print("Audio generated as text is typed in real-time!") + + async def simulate_typing() -> AsyncIterator[str]: + """Simulate a user typing text in real-time.""" + text = "Hello there! I am typing this text in real time. Each word appears as I type it. The speech synthesis happens immediately as I finish each sentence. This is truly real-time text-to-speech!" + + words = text.split() + for i, word in enumerate(words): + yield word + " " + + # Simulate typing speed (faster for short words, slower for long ones) + typing_delay = 0.2 + (len(word) * 0.05) + await asyncio.sleep(typing_delay) + + # Add extra pause after sentences + if word.endswith(('.', '!', '?')): + await asyncio.sleep(0.3) + + async with RealTimeStreamingTTSClient( + api_key="your-api-key", + synthesis_threshold=30, # Quick synthesis trigger + ) as client: + + print("āŒØļø Starting typing simulation...") + start_time = time.time() + + async for audio_chunk in client.stream_live_text_input( + simulate_typing(), + voice="en-US-neural-2", + speed=1.1, + ): + elapsed = time.time() - start_time + print(f"šŸŽµ Live audio at {elapsed:.2f}s: {len(audio_chunk)} bytes") + + # In real app: await play_audio_immediately(audio_chunk) + + print("āœ… Live typing synthesis complete!") + + +async def live_monitoring_example(): + """Demonstrate real-time monitoring with performance metrics.""" + print("\nšŸ“Š Real-time Monitoring Example") + print("Shows performance metrics during live processing!") + + # Create a longer sample for monitoring + long_text = """ + Real-time Performance Monitoring Demo + + """ + "\n\n".join([ + f"Section {i}: This section demonstrates real-time performance monitoring " + f"during live text-to-speech synthesis. The system tracks processing rates, " + f"audio generation speed, buffer health, and latency metrics in real-time." 
+ for i in range(1, 8) + ]) + + monitor_file = Path("monitoring_sample.txt") + monitor_file.write_text(long_text) + + try: + async with RealTimeStreamingTTSClient(api_key="your-api-key") as client: + + print("šŸ“Š Starting monitored real-time synthesis...") + + async for result in client.stream_with_live_monitoring( + monitor_file, + voice="en-US-neural-3", + monitor_interval=0.1, + ): + audio_chunk = result["audio_chunk"] + metrics = result["metrics"] + status = result["status"] + + # Display real-time metrics + print(f"šŸŽµ Chunk {metrics['chunk_id']}: " + f"{len(audio_chunk)} bytes | " + f"Rate: {metrics['chars_per_second']:.1f} chars/s | " + f"Latency: {metrics['elapsed_time']:.2f}s | " + f"RT Ratio: {status['real_time_ratio']:.2f}") + + # In real app: await play_audio_with_monitoring(audio_chunk, metrics) + + print("āœ… Monitored synthesis complete!") + + finally: + if monitor_file.exists(): + monitor_file.unlink() + + +async def streaming_vs_batch_comparison(): + """Compare real-time streaming vs traditional batch processing.""" + print("\n⚔ Streaming vs Batch Comparison") + + test_text = """ + This is a comparison test between real-time streaming synthesis + and traditional batch processing. In batch mode, you would wait + for this entire text to be processed before hearing any audio. + In streaming mode, you hear audio immediately as text is parsed. + """ + + test_file = Path("comparison_test.txt") + test_file.write_text(test_text) + + try: + print("🐌 Simulating BATCH processing (traditional approach):") + batch_start = time.time() + + # Simulate batch processing delay + print(" ā³ Loading entire file...") + await asyncio.sleep(1.0) + print(" ā³ Processing all text...") + await asyncio.sleep(2.0) + print(" ā³ Synthesizing complete audio...") + await asyncio.sleep(3.0) + + batch_time = time.time() - batch_start + print(f" āœ… Batch complete after {batch_time:.1f}s - NOW audio starts playing") + + print("\nšŸš€ Real-time STREAMING processing:") + stream_start = time.time() + + async with RealTimeStreamingTTSClient(api_key="your-api-key") as client: + first_audio = True + + async for audio_chunk in client.stream_live_from_file( + test_file, + read_delay=0.02, # Fast parsing + ): + if first_audio: + first_audio_time = time.time() - stream_start + print(f" šŸŽµ FIRST audio at {first_audio_time:.2f}s - Audio playing while still parsing!") + first_audio = False + + # Continue processing... + + stream_total = time.time() - stream_start + print(f" āœ… Streaming complete in {stream_total:.1f}s total") + + print(f"\nšŸ“ˆ Results:") + print(f" Batch: {batch_time:.1f}s wait before ANY audio") + print(f" Streaming: {first_audio_time:.2f}s to FIRST audio ({batch_time/first_audio_time:.1f}x faster!)") + + finally: + if test_file.exists(): + test_file.unlink() + + +async def live_document_reader(): + """Simulate a live document reader with real-time TTS.""" + print("\nšŸ“š Live Document Reader Simulation") + print("Simulates reading a document with real-time speech synthesis!") + + # Simulate a document being written/received in real-time + async def live_document_stream() -> AsyncIterator[str]: + """Simulate a document being written or received live.""" + sentences = [ + "Breaking news: Real-time text-to-speech technology has reached new heights. ", + "Scientists have developed a system that can synthesize speech as text is being written. ", + "This breakthrough enables immediate audio feedback for live content creation. 
", + "Applications include live transcription, real-time translation, and accessibility tools. ", + "The system processes text incrementally, starting synthesis before complete sentences are finished. ", + "This represents a significant advancement in human-computer interaction. ", + "Users can now hear their text being spoken as they type or as content arrives. ", + "The technology promises to revolutionize how we interact with text-based systems. " + ] + + for sentence in sentences: + # Simulate sentence arriving word by word + words = sentence.split() + for word in words: + yield word + " " + await asyncio.sleep(0.15) # Simulate writing/receiving speed + + # Pause between sentences + await asyncio.sleep(0.5) + + async with RealTimeStreamingTTSClient( + api_key="your-api-key", + synthesis_threshold=40, # Start synthesis quickly + ) as client: + + print("šŸ“° Starting live document reader...") + + async for audio_chunk in client.stream_live_text_input( + live_document_stream(), + voice="en-US-neural-1", + output_format="wav", + ): + print(f"šŸ”Š Live audio: {len(audio_chunk)} bytes") + # In real app: await play_live_audio(audio_chunk) + + print("āœ… Live document reading complete!") + + +async def main(): + """Run all real-time streaming examples.""" + print("šŸš€ Real-time Streaming TTS Examples") + print("=" * 60) + print("šŸŽÆ Audio synthesis happens WHILE text is being parsed!") + print("=" * 60) + + examples = [ + live_file_parsing_example, + live_typing_simulation, + live_monitoring_example, + streaming_vs_batch_comparison, + live_document_reader, + ] + + for example in examples: + try: + await example() + except Exception as e: + print(f"āŒ Example failed: {e}") + + print("\n" + "-" * 40 + "\n") + await asyncio.sleep(1) + + print("šŸŽ‰ All real-time streaming examples completed!") + print("\nšŸ’” Key Benefits Demonstrated:") + print(" • Audio starts immediately (no waiting for complete parsing)") + print(" • Continuous synthesis during text processing") + print(" • Real-time performance monitoring") + print(" • Live input processing (typing, streaming content)") + print(" • Intelligent boundary detection for optimal synthesis") + + +if __name__ == "__main__": + print("āš ļø Remember to set your actual Speechmatics API key!") + print("šŸŽ§ In real usage, replace print statements with actual audio playback!") + asyncio.run(main()) diff --git a/examples/tts/streaming_example.py b/examples/tts/streaming_example.py new file mode 100644 index 0000000..9ab9cd1 --- /dev/null +++ b/examples/tts/streaming_example.py @@ -0,0 +1,243 @@ +""" +Example usage of the Streaming TTS Client. + +This example demonstrates real-time text-to-speech streaming with chunked processing, +WebSocket communication, and incremental audio delivery. +""" + +import asyncio +import io +from pathlib import Path + +# Assuming the streaming client is available +from speechmatics.tts import StreamingTTSClient + + +async def basic_streaming_example(): + """Basic streaming TTS example.""" + print("šŸŽµ Basic Streaming TTS Example") + + text = """ + This is a demonstration of streaming text-to-speech synthesis. + The text is being processed in real-time, chunk by chunk. + You should hear audio being generated as each segment is processed. + This enables immediate playback without waiting for the entire text to be synthesized. 
+ """ + + async with StreamingTTSClient(api_key="your-api-key") as client: + print("šŸ“ Starting streaming synthesis...") + + audio_chunks = [] + async for audio_chunk in client.stream_speech( + text, + voice="en-US-neural-1", + output_format="wav", + sample_rate=22050 + ): + print(f"šŸ”Š Received audio chunk: {len(audio_chunk)} bytes") + audio_chunks.append(audio_chunk) + + # In a real application, you would play this chunk immediately + # await play_audio_chunk(audio_chunk) + + print(f"āœ… Streaming complete! Total chunks: {len(audio_chunks)}") + + # Save complete audio + with open("streaming_output.wav", "wb") as f: + for chunk in audio_chunks: + f.write(chunk) + + +async def file_streaming_example(): + """Stream TTS from a text file.""" + print("\nšŸ“„ File Streaming TTS Example") + + # Create a sample text file + sample_text = """ + Chapter 1: The Beginning + + In the realm of streaming text-to-speech, we embark on a journey of real-time audio generation. + Each sentence flows seamlessly into the next, creating a continuous stream of synthesized speech. + + The technology behind this process involves breaking down large texts into manageable chunks, + processing them through advanced neural networks, and delivering audio incrementally. + + This approach enables immediate playback, reduced memory usage, and better user experience + for applications dealing with long-form content like audiobooks, articles, and documents. + """ + + # Write sample file + sample_file = Path("sample_text.txt") + sample_file.write_text(sample_text) + + try: + async with StreamingTTSClient(api_key="your-api-key") as client: + print(f"šŸ“– Streaming from file: {sample_file}") + + chunk_count = 0 + async for audio_chunk in client.stream_from_file( + sample_file, + voice="en-US-neural-2", + output_format="mp3", + speed=1.1 + ): + chunk_count += 1 + print(f"šŸŽµ File chunk {chunk_count}: {len(audio_chunk)} bytes") + + # In real usage: await play_audio_chunk(audio_chunk) + + print(f"āœ… File streaming complete! Processed {chunk_count} chunks") + + finally: + # Cleanup + if sample_file.exists(): + sample_file.unlink() + + +async def incremental_file_streaming_example(): + """Stream very large files with incremental reading.""" + print("\nšŸ“š Incremental File Streaming Example") + + # Create a larger sample file + large_text = """ + The Art of Streaming Text-to-Speech + + """ + "\n\n".join([ + f"Paragraph {i}: This is a demonstration of incremental file processing. " + f"The system reads the file in chunks and processes them as they become available. " + f"This approach is particularly useful for very large documents, books, or articles " + f"where loading the entire content into memory might not be feasible." + for i in range(1, 21) + ]) + + large_file = Path("large_sample.txt") + large_file.write_text(large_text) + + try: + async with StreamingTTSClient( + api_key="your-api-key", + chunk_size=300, # Smaller chunks for demo + overlap_size=30 + ) as client: + print(f"šŸ“– Incremental streaming from: {large_file}") + + chunk_count = 0 + total_bytes = 0 + + async for audio_chunk in client.stream_from_file_incremental( + large_file, + voice="en-US-neural-3", + output_format="wav", + read_chunk_size=1024 # Read 1KB at a time + ): + chunk_count += 1 + total_bytes += len(audio_chunk) + print(f"šŸŽµ Incremental chunk {chunk_count}: {len(audio_chunk)} bytes " + f"(Total: {total_bytes} bytes)") + + # Simulate real-time playback delay + await asyncio.sleep(0.1) + + print(f"āœ… Incremental streaming complete! 
" + f"Chunks: {chunk_count}, Total audio: {total_bytes} bytes") + + finally: + # Cleanup + if large_file.exists(): + large_file.unlink() + + +async def real_time_playback_simulation(): + """Simulate real-time audio playback with streaming.""" + print("\nšŸŽ® Real-time Playback Simulation") + + text = """ + Welcome to the real-time streaming demonstration. + This example simulates how you would integrate streaming TTS + with an audio playback system for immediate user experience. + """ + + class AudioPlayer: + """Simulated audio player.""" + + def __init__(self): + self.buffer = io.BytesIO() + self.playing = False + + async def add_chunk(self, audio_chunk: bytes): + """Add audio chunk to playback buffer.""" + self.buffer.write(audio_chunk) + if not self.playing: + await self.start_playback() + + async def start_playback(self): + """Start audio playback (simulated).""" + self.playing = True + print("šŸ”Š Audio playback started...") + + # Simulate playback time + await asyncio.sleep(1.0) + + print("šŸ”‡ Audio playback finished") + self.playing = False + + player = AudioPlayer() + + async with StreamingTTSClient(api_key="your-api-key") as client: + print("šŸŽµ Starting real-time synthesis and playback...") + + # Process streaming TTS and playback concurrently + async for audio_chunk in client.stream_speech( + text, + voice="en-US-neural-1", + output_format="wav" + ): + print(f"šŸ“¦ Received chunk: {len(audio_chunk)} bytes") + await player.add_chunk(audio_chunk) + + print("āœ… Real-time streaming and playback complete!") + + +async def error_handling_example(): + """Demonstrate error handling in streaming TTS.""" + print("\nāš ļø Error Handling Example") + + async with StreamingTTSClient(api_key="invalid-key") as client: + try: + async for audio_chunk in client.stream_speech("Test text"): + print(f"Chunk: {len(audio_chunk)} bytes") + + except Exception as e: + print(f"āŒ Error occurred: {type(e).__name__}: {e}") + print("šŸ”§ In production, implement proper error recovery") + + +async def main(): + """Run all streaming TTS examples.""" + print("šŸš€ Speechmatics Streaming TTS Examples") + print("=" * 50) + + examples = [ + basic_streaming_example, + file_streaming_example, + incremental_file_streaming_example, + real_time_playback_simulation, + error_handling_example, + ] + + for example in examples: + try: + await example() + except Exception as e: + print(f"āŒ Example failed: {e}") + + print("\n" + "-" * 30 + "\n") + await asyncio.sleep(1) # Brief pause between examples + + print("šŸŽ‰ All examples completed!") + + +if __name__ == "__main__": + # Note: Replace "your-api-key" with actual Speechmatics API key + print("āš ļø Remember to set your actual Speechmatics API key!") + asyncio.run(main()) diff --git a/sdk/TTS/MIGRATION.md b/sdk/TTS/MIGRATION.md new file mode 100644 index 0000000..abedf51 --- /dev/null +++ b/sdk/TTS/MIGRATION.md @@ -0,0 +1,64 @@ +# Batch SDK Migration Guide + +This guide helps users migrate from the legacy Speechmatics Batch Client (`speechmatics-python`) to the new Speechmatics Batch SDK (`speechmatics-batch`). The new SDK provides a modern async API, improved error handling, and enhanced authentication options with minimal dependencies. 
+
+## Significant Changes
+
+- **Fully async API**: All operations now use the async/await pattern with `AsyncClient`
+- **Enhanced authentication**: Support for both API key and JWT authentication methods
+- **Better error handling**: More specific exceptions and clearer error messages
+- **Lightweight package**: Minimal dependencies for faster installation and fewer conflicts
+- **Improved job management**: Better job status tracking and result handling
+- **Streamlined configuration**: Unified `JobConfig` for all job types
+- **URL and API key configuration**: Both can be loaded from environment variables
+
+### Breaking Changes
+
+- **Import paths**: `speechmatics.batch_client` → `speechmatics.batch`
+- **Client class**: `BatchClient` → `AsyncClient`
+- **All methods**: Synchronous → Asynchronous (requires `await`)
+- **Configuration**: `BatchTranscriptionConfig` → `JobConfig` with `TranscriptionConfig`
+- **Job submission**: Direct config parameter → Structured `JobConfig` object
+- **Result format**: `transcription_format` parameter → `format_type` parameter
+- **Authentication**: API key parameter renamed to `api_key`
+- **CLI not available**: The CLI will be released as a separate package
+
+## Installation
+
+```bash
+pip install speechmatics-batch
+```
+
+## Usage
+
+Before
+
+```python
+from speechmatics.models import ConnectionSettings, BatchTranscriptionConfig
+from speechmatics.batch_client import BatchClient
+
+with BatchClient("API-KEY") as client:
+    job_id = client.submit_job(PATH_TO_FILE, BatchTranscriptionConfig(LANGUAGE))
+
+    transcript = client.wait_for_completion(job_id, transcription_format='txt')
+
+    print(transcript)
+```
+
+After
+
+```python
+import asyncio
+import os
+
+from speechmatics.batch import AsyncClient, FormatType, JobConfig, JobType, TranscriptionConfig
+
+
+async def main():
+    async with AsyncClient(os.environ.get("SPEECHMATICS_API_KEY")) as client:
+        config = JobConfig(
+            type=JobType.TRANSCRIPTION,
+            transcription_config=TranscriptionConfig(language="en"),
+        )
+
+        job = await client.submit_job("audio.wav", config=config)
+
+        result = await client.wait_for_completion(job.id, format_type=FormatType.TXT)
+
+        print(f"Transcript: {result.transcript_text}")
+
+
+asyncio.run(main())
+```

diff --git a/sdk/TTS/README.md b/sdk/TTS/README.md
new file mode 100644
index 0000000..a316002
--- /dev/null
+++ b/sdk/TTS/README.md
@@ -0,0 +1,274 @@
+# Speechmatics Batch API Client
+
+[![PyPI](https://img.shields.io/pypi/v/speechmatics-batch)](https://pypi.org/project/speechmatics-batch/)
+![PythonSupport](https://img.shields.io/badge/Python-3.9%2B-green)
+
+Async Python client for Speechmatics Batch API.
+ +## Features + +- Async API client with comprehensive error handling +- Type hints throughout for better IDE support +- Environment variable support for credentials +- Easy-to-use interface for submitting, monitoring, and retrieving transcription jobs +- Full job configuration support with all Speechmatics features +- Intelligent transcript formatting with speaker diarization +- Support for multiple output formats (JSON, TXT, SRT) + +## Installation + +```bash +pip install speechmatics-batch +``` + +## Usage + +### Quick Start + +```python +import asyncio +from speechmatics.batch import AsyncClient + +async def main(): + # Create a client using environment variable SPEECHMATICS_API_KEY + async with AsyncClient() as client: + # Simple transcription + result = await client.transcribe("audio.wav") + print(result.transcript_text) + +asyncio.run(main()) +``` + +## JWT Authentication + +For enhanced security, use temporary JWT tokens instead of static API keys. +JWTs are short-lived (60 seconds default) and automatically refreshed: + +```python +from speechmatics.batch import AsyncClient, JWTAuth + +auth = JWTAuth("your-api-key", ttl=60) + +async with AsyncClient(auth=auth) as client: + # Tokens are cached and auto-refreshed automatically + result = await client.transcribe("audio.wav") + print(result.transcript_text) +``` + +Ideal for long-running applications or when minimizing API key exposure. +See the [authentication documentation](https://docs.speechmatics.com/introduction/authentication) for more details. + +### Basic Job Workflow + +```python +import asyncio +from speechmatics.batch import AsyncClient, JobConfig, JobType, TranscriptionConfig + +async def main(): + # Create client with explicit API key + async with AsyncClient(api_key="your-api-key") as client: + + # Configure transcription + config = JobConfig( + type=JobType.TRANSCRIPTION, + transcription_config=TranscriptionConfig( + language="en", + enable_entities=True, + diarization="speaker" + ) + ) + + # Submit job + job = await client.submit_job("audio.wav", config=config) + print(f"Job submitted: {job.id}") + + # Wait for completion + result = await client.wait_for_completion( + job.id, + polling_interval=2.0, + timeout=300.0 + ) + + # Access results + print(f"Transcript: {result.transcript_text}") + print(f"Confidence: {result.confidence}") + +asyncio.run(main()) +``` + +### Advanced Configuration + +```python +import asyncio +from speechmatics.batch import ( + AsyncClient, + JobConfig, + JobType, + OperatingPoint, + TranscriptionConfig, + TranslationConfig, + SummarizationConfig +) + +async def main(): + async with AsyncClient(api_key="your-api-key") as client: + + # Advanced job configuration + config = JobConfig( + type=JobType.TRANSCRIPTION, + transcription_config=TranscriptionConfig( + language="en", + operating_point=OperatingPoint.ENHANCED, + enable_entities=True, + diarization="speaker", + ), + translation_config=TranslationConfig(target_languages=["es", "fr"]), + summarization_config=SummarizationConfig( + content_type="conversational", summary_length="brief" + ), + ) + + result = await client.transcribe("audio.wav", config=config) + + # Access advanced features + if result.summary: + print(f"Summary: {result.summary}") + if result.translations: + print(f"Translations: {result.translations}") + +asyncio.run(main()) +``` + +### Manual Job Management + +```python +import asyncio +from speechmatics.batch import AsyncClient, JobStatus + +async def main(): + async with AsyncClient() as client: + + # Submit job + job = 
await client.submit_job("audio.wav") + + # Check job status + job_details = await client.get_job_info(job.id) + print(f"Status: {job_details.status}") + + # Wait for completion manually + while job_details.status == JobStatus.RUNNING: + await asyncio.sleep(5) + job_details = await client.get_job_info(job.id) + + if job_details.status == JobStatus.DONE: + # Get transcript + transcript = await client.get_transcript(job.id) + print(transcript.transcript_text) + else: + print(f"Job failed with status: {job_details.status}") + +asyncio.run(main()) +``` + +### Different Output Formats + +```python +import asyncio +from speechmatics.batch import AsyncClient, FormatType + +async def main(): + async with AsyncClient() as client: + job = await client.submit_job("audio.wav") + + # Get JSON format (default) + json_result = await client.get_transcript(job.id, format_type=FormatType.JSON) + print(json_result.transcript_text) + + # Get plain text + txt_result = await client.get_transcript(job.id, format_type=FormatType.TXT) + print(txt_result) + + # Get SRT subtitles + srt_result = await client.get_transcript(job.id, format_type=FormatType.SRT) + print(srt_result) + +asyncio.run(main()) +``` + +### Error Handling + +```python +import asyncio +from speechmatics.batch import ( + AsyncClient, + BatchError, + AuthenticationError, + JobError, + TimeoutError +) + +async def main(): + try: + async with AsyncClient() as client: + result = await client.transcribe("audio.wav", timeout=120.0) + print(result.transcript_text) + + except AuthenticationError: + print("Invalid API key") + except BatchError as e: + print(f"Job submission failed: {e}") + except JobError as e: + print(f"Job processing failed: {e}") + except TimeoutError as e: + print(f"Job timed out: {e}") + except FileNotFoundError: + print("Audio file not found") + +asyncio.run(main()) +``` + +### Connection Configuration + +```python +import asyncio +from speechmatics.batch import AsyncClient, ConnectionConfig + +async def main(): + # Custom connection settings + config = ConnectionConfig( + url="https://asr.api.speechmatics.com/v2", + api_key="your-api-key", + connect_timeout=30.0, + operation_timeout=600.0 + ) + + async with AsyncClient(conn_config=config) as client: + result = await client.transcribe("audio.wav") + print(result.transcript_text) + +asyncio.run(main()) +``` + +## Logging + +The client supports logging with job id tracing for debugging. 
To increase logging verbosity, set the logging level to `DEBUG` in your application:
+
+```python
+import logging
+import sys
+
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+```
+
+## Environment Variables
+
+The client supports the following environment variables:
+
+- `SPEECHMATICS_API_KEY`: Your Speechmatics API key
+- `SPEECHMATICS_BATCH_URL`: Custom API endpoint URL (optional)

diff --git a/sdk/TTS/pyproject.toml b/sdk/TTS/pyproject.toml
new file mode 100644
index 0000000..065ccec
--- /dev/null
+++ b/sdk/TTS/pyproject.toml
@@ -0,0 +1,52 @@
+[build-system]
+requires = ["setuptools>=61.0.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "speechmatics-tts"
+dynamic = ["version"]
+description = "Speechmatics TTS API Client"
+readme = "README.md"
+authors = [{ name = "Speechmatics", email = "support@speechmatics.com" }]
+license = "MIT"
+requires-python = ">=3.9"
+dependencies = ["aiohttp", "aiofiles"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Operating System :: OS Independent",
+    "Topic :: Multimedia :: Sound/Audio :: Speech",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+keywords = ["speechmatics", "text-to-speech", "tts", "speech-synthesis", "api"]
+
+[project.optional-dependencies]
+dev = [
+    "black",
+    "ruff",
+    "mypy",
+    "types-aiofiles",
+    "pre-commit",
+    "pytest",
+    "pytest-asyncio",
+    "pytest-cov",
+    "pytest-mock",
+    "build",
+]
+
+[project.urls]
+homepage = "https://github.com/speechmatics/speechmatics-python-sdk"
+documentation = "https://docs.speechmatics.com/"
+repository = "https://github.com/speechmatics/speechmatics-python-sdk"
+issues = "https://github.com/speechmatics/speechmatics-python-sdk/issues"
+
+[tool.setuptools.dynamic]
+version = { attr = "speechmatics.tts.__version__" }
+
+[tool.setuptools.packages.find]
+where = ["."]

diff --git a/sdk/TTS/speechmatics/__init__.py b/sdk/TTS/speechmatics/__init__.py
new file mode 100644
index 0000000..e69de29

diff --git a/sdk/TTS/speechmatics/tts/__init__.py b/sdk/TTS/speechmatics/tts/__init__.py
new file mode 100644
index 0000000..f488880
--- /dev/null
+++ b/sdk/TTS/speechmatics/tts/__init__.py
@@ -0,0 +1,30 @@
+__version__ = "0.0.0"
+
+from ._async_client import AsyncClient
+from ._auth import AuthBase
+from ._auth import JWTAuth
+from ._auth import StaticKeyAuth
+from ._exceptions import AuthenticationError
+from ._exceptions import BatchError
+from ._exceptions import ConfigurationError
+from ._exceptions import ConnectionError
+from ._exceptions import JobError
+from ._exceptions import TimeoutError
+from ._exceptions import TransportError
+from ._models import ConnectionConfig
+
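+# Names re-exported as the public API of the TTS SDK.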
+__all__ = [
+    "AsyncClient",
+    "AuthBase",
+    "JWTAuth",
+    "StaticKeyAuth",
+    "ConfigurationError",
+    "AuthenticationError",
+    "ConnectionError",
+    "TransportError",
+    "BatchError",
+    "JobError",
+    "TimeoutError",
+    "ConnectionConfig",
+]

diff --git a/sdk/TTS/speechmatics/tts/_async_client.py b/sdk/TTS/speechmatics/tts/_async_client.py
new file mode 100644
index 0000000..46b8106
--- /dev/null
+++ b/sdk/TTS/speechmatics/tts/_async_client.py
@@ -0,0 +1,249 @@
+"""
+Asynchronous client for Speechmatics text-to-speech synthesis.
+
+This module provides the main AsyncClient class that handles
+text-to-speech synthesis using the Speechmatics TTS API.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+import uuid
+from typing import Any
+from typing import BinaryIO
+from typing import Optional
+from typing import Union
+
+import aiohttp
+
+from ._auth import AuthBase
+from ._auth import StaticKeyAuth
+from ._exceptions import AuthenticationError
+from ._exceptions import TimeoutError
+from ._logging import get_logger
+from ._models import ConnectionConfig
+
+from ._transport import Transport
+
+
+class AsyncClient:
+    """
+    Asynchronous client for Speechmatics text-to-speech synthesis.
+
+    This client provides a full-featured async interface to the Speechmatics
+    TTS API, supporting speech synthesis from plain text or text files with
+    comprehensive error management. It is built on the Speechmatics REST API.
+
+    The client handles the complete synthesis workflow:
+    1. Request submission with text and synthesis options
+    2. Retrieval of the generated audio
+    3. Proper cleanup and error handling
+
+    Args:
+        auth: Authentication instance. If not provided, uses StaticKeyAuth
+            with the api_key parameter or the SPEECHMATICS_API_KEY environment variable.
+        api_key: Speechmatics API key (used only if auth is not provided).
+        url: REST API endpoint URL. If not provided, uses the SPEECHMATICS_TTS_URL
+            environment variable or defaults to the production endpoint.
+        conn_config: Complete connection configuration object. If provided, overrides
+            other parameters.
+
+    Raises:
+        ConfigurationError: If required configuration is missing or invalid.
+
+    Examples:
+        Basic usage:
+            >>> async with AsyncClient(api_key="your-key") as client:
+            ...     response = await client.synthesize_speech("Hello world")
+            ...     audio_data = await response.read()
+
+        With JWT authentication:
+            >>> from speechmatics.tts import JWTAuth
+            >>> auth = JWTAuth("your-api-key", ttl=3600)
+            >>> async with AsyncClient(auth=auth) as client:
+            ...     # Use client with JWT auth
+            ...     pass
+    """
+
+    def __init__(
+        self,
+        auth: Optional[AuthBase] = None,
+        *,
+        api_key: Optional[str] = None,
+        url: Optional[str] = None,
+        conn_config: Optional[ConnectionConfig] = None,
+    ) -> None:
+        """
+        Initialize the AsyncClient.
+
+        Args:
+            auth: Authentication method; either StaticKeyAuth or JWTAuth.
+                If None, creates StaticKeyAuth with the api_key.
+            api_key: Speechmatics API key. If None, uses the SPEECHMATICS_API_KEY env var.
+            url: REST API endpoint URL. If None, uses the SPEECHMATICS_TTS_URL env var
+                or defaults to the production endpoint.
+            conn_config: Complete connection configuration.
+
+        Raises:
+            ConfigurationError: If auth is None and an API key is not provided/found.
+        """
+        self._auth = auth or StaticKeyAuth(api_key)
+        self._url = url or os.environ.get("SPEECHMATICS_TTS_URL") or "https://tts.api.speechmatics.com/v2"
+        self._conn_config = conn_config or ConnectionConfig()
+        self._request_id = str(uuid.uuid4())
+        self._transport = Transport(self._url, self._conn_config, self._auth, self._request_id)
+
+        self._logger = get_logger(__name__)
+        self._logger.debug("AsyncClient initialized (request_id=%s, url=%s)", self._request_id, self._url)
+
+    async def __aenter__(self) -> AsyncClient:
+        """
+        Async context manager entry.
+
+        Returns:
+            Self for use in async with statements.
+
+        Examples:
+            >>> async with AsyncClient(api_key="key") as client:
+            ...     response = await client.synthesize_speech("Hello world")
+        """
+        return self
+
+    async def synthesize_speech(
+        self,
+        text: str,
+        *,
+        voice: Optional[str] = None,
+        output_format: str = "wav",
+        sample_rate: Optional[int] = None,
+        speed: Optional[float] = None,
+    ) -> aiohttp.ClientResponse:
+        """
+        Convert text to speech audio.
+
+        Args:
+            text: Text to convert to speech.
+            voice: Voice ID to use for synthesis (e.g., "en-US-neural-1").
+            output_format: Audio format ("wav", "mp3", "ogg").
+            sample_rate: Audio sample rate in Hz (e.g., 22050, 44100).
+            speed: Speech speed multiplier (0.5 to 2.0).
+
+        Returns:
+            Raw aiohttp ClientResponse object; read the audio with ``await response.read()``.
+
+        Raises:
+            AuthenticationError: If the API key is invalid.
+            TransportError: If synthesis fails.
+
+        Examples:
+            >>> response = await client.synthesize_speech("Hello world")
+            >>> audio_data = await response.read()
+            >>> with open("output.wav", "wb") as f:
+            ...     f.write(audio_data)
+        """
+        # Prepare synthesis request
+        request_data = {
+            "text": text,
+            "output_format": output_format,
+        }
+
+        # Optional numeric parameters are serialized as strings
+        if voice:
+            request_data["voice"] = voice
+        if sample_rate:
+            request_data["sample_rate"] = str(sample_rate)
+        if speed:
+            request_data["speed"] = str(speed)
+
+        response = await self._transport.post("/synthesize", json_data=request_data)
+        return response
+
+    async def synthesize_from_file(
+        self,
+        file_path: Union[str, os.PathLike],
+        *,
+        voice: Optional[str] = None,
+        output_format: str = "wav",
+        sample_rate: Optional[int] = None,
+        speed: Optional[float] = None,
+    ) -> aiohttp.ClientResponse:
+        """
+        Convert text from a file to speech audio.
+
+        Args:
+            file_path: Path to a text or SSML file.
+            voice: Voice ID to use for synthesis.
+            output_format: Audio format ("wav", "mp3", "ogg").
+            sample_rate: Audio sample rate in Hz.
+            speed: Speech speed multiplier (0.5 to 2.0).
+
+        Returns:
+            Raw aiohttp ClientResponse object.
+
+        Raises:
+            FileNotFoundError: If the file doesn't exist.
+            AuthenticationError: If the API key is invalid.
+            TransportError: If synthesis fails.
+
+        Examples:
+            >>> response = await client.synthesize_from_file("script.txt")
+            >>> audio_data = await response.read()
+            >>> with open("output.wav", "wb") as f:
+            ...     f.write(audio_data)
+        """
+        import aiofiles
+        from pathlib import Path
+
+        file_path_obj = Path(file_path)
+        if not file_path_obj.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        # Read text content
+        async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
+            text_content = await f.read()
+
+        return await self.synthesize_speech(
+            text_content,
+            voice=voice,
+            output_format=output_format,
+            sample_rate=sample_rate,
+            speed=speed,
+        )
+
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """
+        Async context manager exit with automatic cleanup.
+
+        Ensures all resources are properly cleaned up when exiting the
+        async context manager, including closing HTTP connections.
+
+        Args:
+            exc_type: Exception type if an exception occurred.
+            exc_val: Exception value if an exception occurred.
+            exc_tb: Exception traceback if an exception occurred.
+        """
+        await self.close()
+
+    async def close(self) -> None:
+        """
+        Close the client and clean up all resources.
+
+        This method ensures proper cleanup of all client resources, including
+        closing HTTP connections and sessions.
+
+        It is safe to call multiple times and will handle cleanup
+        gracefully even if errors occur during the process.
+
+        Examples:
+            >>> client = AsyncClient(api_key="key")
+            >>> try:
+            ...     response = await client.synthesize_speech("Hello world")
+            ... finally:
+            ...     await client.close()
+        """
+        try:
+            await self._transport.close()
+        except Exception:
+            pass  # Best effort cleanup

diff --git a/sdk/TTS/speechmatics/tts/_auth.py b/sdk/TTS/speechmatics/tts/_auth.py
new file mode 100644
index 0000000..e71730a
--- /dev/null
+++ b/sdk/TTS/speechmatics/tts/_auth.py
@@ -0,0 +1,162 @@
+import abc
+import asyncio
+import os
+import time
+from typing import Literal
+from typing import Optional
+
+from ._exceptions import AuthenticationError
+
+
+class AuthBase(abc.ABC):
+    """
+    Abstract base class for authentication methods.
+    """
+
+    BASE_URL = "https://mp.speechmatics.com"
+
+    @abc.abstractmethod
+    async def get_auth_headers(self) -> dict[str, str]:
+        """
+        Get authentication headers asynchronously.
+
+        Returns:
+            A dictionary of authentication headers.
+        """
+        raise NotImplementedError
+
+
+class StaticKeyAuth(AuthBase):
+    """
+    Authentication using a static API key.
+
+    This is the traditional authentication method, where the same
+    API key is used for all requests.
+
+    Args:
+        api_key: The Speechmatics API key.
+
+    Examples:
+        >>> auth = StaticKeyAuth("your-api-key")
+        >>> headers = await auth.get_auth_headers()
+        >>> print(headers)
+        {'Authorization': 'Bearer your-api-key'}
+    """
+
+    def __init__(self, api_key: Optional[str] = None):
+        self._api_key = api_key or os.environ.get("SPEECHMATICS_API_KEY")
+
+        if not self._api_key:
+            raise ValueError("API key required: provide api_key or set SPEECHMATICS_API_KEY")
+
+    async def get_auth_headers(self) -> dict[str, str]:
+        return {"Authorization": f"Bearer {self._api_key}"}
+
+
+class JWTAuth(AuthBase):
+    """
+    Authentication using temporary JWT tokens.
+
+    Generates short-lived JWTs for enhanced security.
+
+    Args:
+        api_key: The main Speechmatics API key used to generate JWTs.
+        ttl: Time-to-live for tokens, between 60 and 86400 seconds.
+            For security reasons, we suggest using the shortest TTL possible.
+        region: Self-Service customers are restricted to "eu".
+            Enterprise customers can use this to specify which region the temporary key should be enabled in.
+        client_ref: Optional client reference for JWT token. 
+ This parameter must be used if the temporary keys are exposed to the end-user's client + to prevent a user from accessing the data of a different user. + mp_url: Optional management platform URL override. + request_id: Optional request ID for debugging purposes. + + Examples: + >>> auth = JWTAuth("your-api-key") + >>> headers = await auth.get_auth_headers() + >>> print(headers) + {'Authorization': 'Bearer eyJhbGciOiJSUzI1NiIs...'} + """ + + def __init__( + self, + api_key: Optional[str] = None, + *, + ttl: int = 60, + region: Literal["eu", "usa", "au"] = "eu", + client_ref: Optional[str] = None, + mp_url: Optional[str] = None, + request_id: Optional[str] = None, + ): + self._api_key = api_key or os.environ.get("SPEECHMATICS_API_KEY") + self._ttl = ttl + self._region = region + self._client_ref = client_ref + self._request_id = request_id + self._mp_url = mp_url or os.getenv("SM_MANAGEMENT_PLATFORM_URL", self.BASE_URL) + + if not self._api_key: + raise ValueError( + "API key required: please provide api_key or set SPEECHMATICS_API_KEY environment variable" + ) + + if not 60 <= self._ttl <= 86_400: + raise ValueError("ttl must be between 60 and 86400 seconds") + + self._cached_token: Optional[str] = None + self._token_expires_at: float = 0 + self._token_lock = asyncio.Lock() + + async def get_auth_headers(self) -> dict[str, str]: + """Get JWT auth headers with caching.""" + async with self._token_lock: + current_time = time.time() + if current_time >= self._token_expires_at - 10: + self._cached_token = await self._generate_token() + self._token_expires_at = current_time + self._ttl + + return {"Authorization": f"Bearer {self._cached_token}"} + + async def _generate_token(self) -> str: + try: + import aiohttp + except ImportError: + raise ImportError( + "aiohttp is required for JWT authentication. 
Please install it with `pip install aiohttp`"
+            )
+
+        endpoint = f"{self._mp_url}/v1/api_keys"
+        params = {"type": "batch"}
+        payload = {"ttl": self._ttl, "region": str(self._region)}
+
+        if self._client_ref:
+            payload["client_ref"] = self._client_ref
+
+        headers = {
+            "Authorization": f"Bearer {self._api_key}",
+            "Content-Type": "application/json",
+        }
+
+        if self._request_id:
+            headers["X-Request-Id"] = self._request_id
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    endpoint,
+                    params=params,
+                    json=payload,
+                    headers=headers,
+                    timeout=aiohttp.ClientTimeout(total=10),
+                ) as response:
+                    if response.status != 201:
+                        text = await response.text()
+                        raise AuthenticationError(f"Failed to generate JWT: HTTP {response.status}: {text}")
+
+                    data = await response.json()
+                    return str(data["key_value"])
+
+        except aiohttp.ClientError as e:
+            raise AuthenticationError(f"Network error generating JWT: {e}")
+        except Exception as e:
+            raise AuthenticationError(f"Unexpected error generating JWT: {e}")

diff --git a/sdk/TTS/speechmatics/tts/_exceptions.py b/sdk/TTS/speechmatics/tts/_exceptions.py
new file mode 100644
index 0000000..173c400
--- /dev/null
+++ b/sdk/TTS/speechmatics/tts/_exceptions.py
@@ -0,0 +1,40 @@
+class ConfigurationError(Exception):
+    """Raised when there's an error in configuration."""
+
+    pass
+
+
+class AuthenticationError(Exception):
+    """Raised when authentication fails."""
+
+    pass
+
+
+class ConnectionError(Exception):
+    """Raised when connection to the service fails."""
+
+    pass
+
+
+class TransportError(Exception):
+    """Raised when there's an error in the transport layer."""
+
+    pass
+
+
+class BatchError(Exception):
+    """Raised when batch processing fails."""
+
+    pass
+
+
+class JobError(Exception):
+    """Raised when there's an error with a job."""
+
+    pass
+
+
+class TimeoutError(Exception):
+    """Raised when an operation times out."""
+
+    pass

diff --git a/sdk/TTS/speechmatics/tts/_helpers.py b/sdk/TTS/speechmatics/tts/_helpers.py
new file mode 100644
index 0000000..443e49a
--- /dev/null
+++ b/sdk/TTS/speechmatics/tts/_helpers.py
@@ -0,0 +1,57 @@
+"""
+Utility functions for the Speechmatics TTS SDK.
+"""
+
+from __future__ import annotations
+
+import importlib.metadata
+import os
+from collections.abc import AsyncGenerator
+from contextlib import asynccontextmanager
+from typing import BinaryIO
+from typing import Union
+
+import aiofiles
+
+
+@asynccontextmanager
+async def prepare_audio_file(
+    audio_file: Union[str, BinaryIO],
+) -> AsyncGenerator[tuple[str, Union[BinaryIO, bytes]], None]:
+    """
+    Async context manager for file handling with proper resource management.
+
+    Args:
+        audio_file: Path to an audio file or a file-like object containing audio data.
+
+    Yields:
+        Tuple of (filename, file_data).
+
+    Examples:
+        >>> async with prepare_audio_file("audio.wav") as (filename, file_data):
+        ...     # Use file_data for upload
+        ...     pass
+    """
+    if isinstance(audio_file, str):
+        async with aiofiles.open(audio_file, "rb") as f:
+            content = await f.read()
+            filename = os.path.basename(audio_file)
+            yield filename, content
+    else:
+        # It's already a file-like object
+        filename = getattr(audio_file, "name", "audio.wav")
+        if hasattr(filename, "split"):
+            filename = os.path.basename(filename)
+        yield filename, audio_file
+
+
+def get_version() -> str:
+    try:
+        return importlib.metadata.version("speechmatics-tts")
+    except importlib.metadata.PackageNotFoundError:
+        try:
+            from . 
import __version__ + + return __version__ + except ImportError: + return "0.0.0" diff --git a/sdk/TTS/speechmatics/tts/_logging.py b/sdk/TTS/speechmatics/tts/_logging.py new file mode 100644 index 0000000..e63bda6 --- /dev/null +++ b/sdk/TTS/speechmatics/tts/_logging.py @@ -0,0 +1,49 @@ +import logging + +logger = logging.getLogger(__name__) +logger.addHandler(logging.NullHandler()) + + +def get_logger(name: str) -> logging.Logger: + """ + Get a logger that stays silent by default. + + The logger uses Python's standard logging module and includes NullHandler + by default to avoid unwanted output. Users can configure logging levels + and handlers as needed. + + Args: + name: Logger name, typically __name__ from the calling module. + + Returns: + Configured logger instance. + + Examples: + Basic usage in SDK modules: + logger = get_logger(__name__) + logger.debug("HTTP request sent %s %s", method, url) + logger.info("Job submitted (job_id=%s)", job_id) + logger.warning("Job failed (job_id=%s): %s", job_id, error) + logger.error("Connection failed: %s", e) + + Enable debug logging in user code: + import logging + logging.basicConfig(level=logging.DEBUG) + # Now all SDK debug messages will be visible + + Custom logging configuration: + import logging + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + # Or for specific components: + logging.getLogger('speechmatics.batch').setLevel(logging.DEBUG) + """ + module_logger = logging.getLogger(name) + module_logger.addHandler(logging.NullHandler()) + return module_logger + + +__all__ = ["get_logger"] diff --git a/sdk/TTS/speechmatics/tts/_models.py b/sdk/TTS/speechmatics/tts/_models.py new file mode 100644 index 0000000..71c6bcb --- /dev/null +++ b/sdk/TTS/speechmatics/tts/_models.py @@ -0,0 +1,31 @@ +""" +Models for the Speechmatics TTS SDK. + +This module contains all data models, enums, and configuration classes used +throughout the Speechmatics TTS SDK. These models +provide type-safe interfaces for configuration, job management, and +result handling based on the official Speechmatics API schema. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + + + + +@dataclass +class ConnectionConfig: + """ + Configuration for HTTP connection parameters. + + This class defines connection-related settings and timeouts. + + Attributes: + connect_timeout: Timeout in seconds for connection establishment. + operation_timeout: Default timeout for API operations. + """ + + connect_timeout: float = 30.0 + operation_timeout: float = 300.0 diff --git a/sdk/TTS/speechmatics/tts/_realtime_streaming_client.py b/sdk/TTS/speechmatics/tts/_realtime_streaming_client.py new file mode 100644 index 0000000..38ceec5 --- /dev/null +++ b/sdk/TTS/speechmatics/tts/_realtime_streaming_client.py @@ -0,0 +1,440 @@ +""" +Real-time streaming TTS client with live text parsing and synthesis. + +This module provides true real-time TTS where audio generation happens +simultaneously with text parsing, providing immediate audio output +as text becomes available. 
+""" + +from __future__ import annotations + +import asyncio +import json +import re +from typing import AsyncGenerator, Optional, Union, List, Dict, Any, AsyncIterator +from pathlib import Path +import time + +import aiohttp +import aiofiles + +from ._auth import AuthBase, StaticKeyAuth +from ._exceptions import AuthenticationError, TransportError +from ._logging import get_logger +from ._models import ConnectionConfig + + +class RealTimeStreamingTTSClient: + """ + Real-time streaming TTS client with live text parsing. + + This client provides true real-time TTS where: + - Text is parsed/read incrementally from source + - Audio synthesis happens immediately as text becomes available + - Audio chunks are delivered while more text is still being processed + - No waiting for complete text parsing before synthesis begins + + Examples: + Live file parsing with immediate audio: + >>> async with RealTimeStreamingTTSClient(api_key="key") as client: + ... async for audio_chunk in client.stream_live_from_file("book.txt"): + ... await play_audio_immediately(audio_chunk) + ... # Audio plays while file is still being read! + + Real-time text input streaming: + >>> async for audio_chunk in client.stream_live_text_input(): + ... await play_audio_chunk(audio_chunk) + ... # Audio generated as user types! + """ + + def __init__( + self, + auth: Optional[AuthBase] = None, + *, + api_key: Optional[str] = None, + url: Optional[str] = None, + conn_config: Optional[ConnectionConfig] = None, + parse_chunk_size: int = 256, # Smaller for real-time parsing + synthesis_threshold: int = 100, # Start synthesis after this many chars + max_parse_delay: float = 0.1, # Max delay between parse chunks (seconds) + ) -> None: + """Initialize real-time streaming TTS client.""" + self._auth = auth or StaticKeyAuth(api_key) + self._url = url or "wss://tts.api.speechmatics.com/v1" + self._conn_config = conn_config or ConnectionConfig() + + # Real-time parsing configuration + self._parse_chunk_size = parse_chunk_size + self._synthesis_threshold = synthesis_threshold + self._max_parse_delay = max_parse_delay + + self._logger = get_logger(__name__) + self._session: Optional[aiohttp.ClientSession] = None + self._websocket: Optional[aiohttp.ClientWebSocketResponse] = None + + async def __aenter__(self) -> RealTimeStreamingTTSClient: + """Async context manager entry.""" + await self._ensure_session() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Async context manager exit with cleanup.""" + await self.close() + + async def _ensure_session(self) -> None: + """Ensure aiohttp session is created.""" + if not self._session: + self._session = aiohttp.ClientSession() + + async def close(self) -> None: + """Close all connections and cleanup resources.""" + if self._websocket: + await self._websocket.close() + self._websocket = None + + if self._session: + await self._session.close() + self._session = None + + async def _connect_websocket(self) -> aiohttp.ClientWebSocketResponse: + """Establish WebSocket connection for real-time streaming.""" + await self._ensure_session() + + headers = await self._auth.get_auth_headers() + + self._websocket = await self._session.ws_connect( + f"{self._url}/realtime-synthesize", + headers=headers, + heartbeat=30 + ) + + return self._websocket + + async def _find_synthesis_boundary(self, text: str) -> int: + """ + Find optimal boundary for synthesis (sentence/phrase end). + + Returns position where synthesis should start, or -1 if not ready. 
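+
+        The search prefers sentence-final punctuation, then phrase
+        boundaries (commas, semicolons, conjunctions), then a word
+        boundary near the middle of a long unpunctuated buffer.
+
+        Example (illustrative sketch, assuming synthesis_threshold=10):
+            >>> pos = await self._find_synthesis_boundary("Hi there. And more")
+            >>> # pos == 10, so "Hi there. " would be sent for synthesis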
+ """ + if len(text) < self._synthesis_threshold: + return -1 + + # Look for sentence endings first + sentence_patterns = [r'[.!?]\s+', r'[.!?]$'] + for pattern in sentence_patterns: + matches = list(re.finditer(pattern, text)) + if matches: + return matches[-1].end() + + # Look for phrase boundaries + phrase_patterns = [r'[,;:]\s+', r'\s+and\s+', r'\s+but\s+', r'\s+or\s+'] + for pattern in phrase_patterns: + matches = list(re.finditer(pattern, text)) + if matches and len(text[:matches[-1].end()]) >= self._synthesis_threshold: + return matches[-1].end() + + # If text is long enough, find word boundary + if len(text) >= self._synthesis_threshold * 2: + word_boundaries = [m.start() for m in re.finditer(r'\s+', text)] + if word_boundaries: + # Find boundary closest to middle + target = len(text) // 2 + closest = min(word_boundaries, key=lambda x: abs(x - target)) + return closest + + return -1 + + async def stream_live_from_file( + self, + file_path: Union[str, Path], + *, + voice: Optional[str] = None, + output_format: str = "wav", + sample_rate: Optional[int] = None, + speed: Optional[float] = None, + read_delay: float = 0.05, # Simulate real-time reading + ) -> AsyncGenerator[bytes, None]: + """ + Stream TTS with real-time file parsing and immediate synthesis. + + This method reads the file incrementally and starts synthesis + as soon as enough text is available, without waiting for + complete file reading. + + Args: + file_path: Path to text file + read_delay: Delay between file reads (simulates real-time input) + + Yields: + Audio chunks as they're generated from live parsing + """ + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + ws = await self._connect_websocket() + + try: + # Send synthesis configuration + config = { + "type": "start_realtime_synthesis", + "voice": voice, + "output_format": output_format, + "sample_rate": sample_rate, + "speed": speed, + "realtime_mode": True, + } + await ws.send_str(json.dumps({k: v for k, v in config.items() if v is not None})) + + # Start concurrent tasks for parsing and audio collection + parse_task = asyncio.create_task( + self._parse_file_realtime(ws, file_path, read_delay) + ) + + # Yield audio chunks as they become available + async for audio_chunk in self._collect_realtime_audio(ws): + yield audio_chunk + + finally: + # Cleanup + if not parse_task.done(): + parse_task.cancel() + await ws.send_str(json.dumps({"type": "end_realtime_synthesis"})) + + async def _parse_file_realtime( + self, + ws: aiohttp.ClientWebSocketResponse, + file_path: Path, + read_delay: float + ) -> None: + """Parse file in real-time and send text chunks for immediate synthesis.""" + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + text_buffer = "" + chunk_id = 0 + + while True: + # Read small chunk from file + file_chunk = await f.read(self._parse_chunk_size) + if not file_chunk: + break + + text_buffer += file_chunk + self._logger.debug(f"Read chunk: {len(file_chunk)} chars, buffer: {len(text_buffer)} chars") + + # Check if we have enough text for synthesis + synthesis_pos = await self._find_synthesis_boundary(text_buffer) + + if synthesis_pos > 0: + # Extract text for synthesis + synthesis_text = text_buffer[:synthesis_pos].strip() + text_buffer = text_buffer[synthesis_pos:] + + if synthesis_text: + # Send for immediate synthesis + message = { + "type": "realtime_text_chunk", + "chunk_id": chunk_id, + "text": synthesis_text, + "timestamp": time.time(), + } + await 
ws.send_str(json.dumps(message)) + chunk_id += 1 + + self._logger.info(f"Sent for synthesis: '{synthesis_text[:50]}...' ({len(synthesis_text)} chars)") + + # Simulate real-time reading delay + await asyncio.sleep(read_delay) + + # Process remaining buffer + if text_buffer.strip(): + message = { + "type": "realtime_text_chunk", + "chunk_id": chunk_id, + "text": text_buffer.strip(), + "timestamp": time.time(), + "final": True, + } + await ws.send_str(json.dumps(message)) + + async def _collect_realtime_audio( + self, + ws: aiohttp.ClientWebSocketResponse + ) -> AsyncGenerator[bytes, None]: + """Collect audio chunks in real-time as they're generated.""" + async for msg in ws: + if msg.type == aiohttp.WSMsgType.TEXT: + data = json.loads(msg.data) + + if data.get("type") == "realtime_audio_chunk": + # Decode and yield audio immediately + import base64 + audio_data = base64.b64decode(data["audio"]) + + # Log timing information + chunk_id = data.get("chunk_id", "unknown") + latency = time.time() - data.get("synthesis_start", time.time()) + self._logger.info(f"Audio chunk {chunk_id}: {len(audio_data)} bytes (latency: {latency:.3f}s)") + + yield audio_data + + elif data.get("type") == "synthesis_complete": + self._logger.info("Real-time synthesis complete") + break + + elif data.get("type") == "error": + raise TransportError(f"Real-time synthesis error: {data.get('message')}") + + elif msg.type == aiohttp.WSMsgType.ERROR: + raise TransportError(f"WebSocket error: {ws.exception()}") + + async def stream_live_text_input( + self, + text_source: AsyncIterator[str], + *, + voice: Optional[str] = None, + output_format: str = "wav", + sample_rate: Optional[int] = None, + speed: Optional[float] = None, + ) -> AsyncGenerator[bytes, None]: + """ + Stream TTS from live text input (e.g., user typing, live transcription). + + Args: + text_source: Async iterator providing text chunks as they become available + + Yields: + Audio chunks generated from live text input + + Examples: + >>> async def user_typing(): + ... # Simulate user typing + ... words = "Hello world this is live typing".split() + ... for word in words: + ... yield word + " " + ... await asyncio.sleep(0.5) # Typing delay + ... + >>> async for audio in client.stream_live_text_input(user_typing()): + ... 
await play_audio_immediately(audio) + """ + ws = await self._connect_websocket() + + try: + # Send configuration + config = { + "type": "start_live_input_synthesis", + "voice": voice, + "output_format": output_format, + "sample_rate": sample_rate, + "speed": speed, + "live_input_mode": True, + } + await ws.send_str(json.dumps({k: v for k, v in config.items() if v is not None})) + + # Start concurrent processing + input_task = asyncio.create_task( + self._process_live_input(ws, text_source) + ) + + # Yield audio as it becomes available + async for audio_chunk in self._collect_realtime_audio(ws): + yield audio_chunk + + finally: + if not input_task.done(): + input_task.cancel() + await ws.send_str(json.dumps({"type": "end_live_input_synthesis"})) + + async def _process_live_input( + self, + ws: aiohttp.ClientWebSocketResponse, + text_source: AsyncIterator[str] + ) -> None: + """Process live text input and send for immediate synthesis.""" + text_buffer = "" + chunk_id = 0 + + async for text_chunk in text_source: + text_buffer += text_chunk + self._logger.debug(f"Received input: '{text_chunk}', buffer: '{text_buffer}'") + + # Check for synthesis opportunity + synthesis_pos = await self._find_synthesis_boundary(text_buffer) + + if synthesis_pos > 0: + synthesis_text = text_buffer[:synthesis_pos].strip() + text_buffer = text_buffer[synthesis_pos:] + + if synthesis_text: + message = { + "type": "live_input_chunk", + "chunk_id": chunk_id, + "text": synthesis_text, + "timestamp": time.time(), + } + await ws.send_str(json.dumps(message)) + chunk_id += 1 + + self._logger.info(f"Live synthesis: '{synthesis_text}'") + + # Process final buffer + if text_buffer.strip(): + message = { + "type": "live_input_chunk", + "chunk_id": chunk_id, + "text": text_buffer.strip(), + "timestamp": time.time(), + "final": True, + } + await ws.send_str(json.dumps(message)) + + async def stream_with_live_monitoring( + self, + file_path: Union[str, Path], + *, + voice: Optional[str] = None, + output_format: str = "wav", + monitor_interval: float = 0.1, + ) -> AsyncGenerator[Dict[str, Any], None]: + """ + Stream with real-time monitoring and metrics. 
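+
+        A sketch of how the yielded records might be consumed:
+
+            >>> async for result in client.stream_with_live_monitoring("speech.txt"):
+            ...     m = result["metrics"]
+            ...     print(f"chunk {m['chunk_id']}: {m['audio_bytes_per_second']:.0f} B/s")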
+ + Yields dictionaries containing: + - audio_chunk: The audio bytes + - metrics: Real-time performance metrics + - status: Current processing status + """ + start_time = time.time() + total_chars_processed = 0 + total_audio_bytes = 0 + chunk_count = 0 + + async for audio_chunk in self.stream_live_from_file( + file_path, + voice=voice, + output_format=output_format + ): + chunk_count += 1 + total_audio_bytes += len(audio_chunk) + + # Calculate real-time metrics + elapsed_time = time.time() - start_time + processing_rate = total_chars_processed / elapsed_time if elapsed_time > 0 else 0 + audio_rate = total_audio_bytes / elapsed_time if elapsed_time > 0 else 0 + + yield { + "audio_chunk": audio_chunk, + "metrics": { + "chunk_id": chunk_count, + "chunk_size_bytes": len(audio_chunk), + "total_audio_bytes": total_audio_bytes, + "elapsed_time": elapsed_time, + "chars_per_second": processing_rate, + "audio_bytes_per_second": audio_rate, + "estimated_audio_duration": total_audio_bytes / 44100 / 2, # Rough estimate + }, + "status": { + "processing": True, + "real_time_ratio": processing_rate / 150 if processing_rate > 0 else 0, # Assuming 150 chars/sec reading + "buffer_health": "good", # Could be calculated based on actual buffer status + } + } diff --git a/sdk/TTS/speechmatics/tts/_streaming_client.py b/sdk/TTS/speechmatics/tts/_streaming_client.py new file mode 100644 index 0000000..eb927ff --- /dev/null +++ b/sdk/TTS/speechmatics/tts/_streaming_client.py @@ -0,0 +1,397 @@ +""" +Streaming TTS client for real-time text-to-speech synthesis. + +This module provides streaming text-to-speech capabilities with chunked processing, +real-time audio delivery, and WebSocket/SSE support for incremental synthesis. +""" + +from __future__ import annotations + +import asyncio +import json +import re +from typing import AsyncGenerator, Optional, Union, List, Dict, Any +from pathlib import Path + +import aiohttp +import aiofiles + +from ._auth import AuthBase, StaticKeyAuth +from ._exceptions import AuthenticationError, TransportError +from ._logging import get_logger +from ._models import ConnectionConfig +from ._transport import Transport + + +class StreamingTTSClient: + """ + Streaming Text-to-Speech client with real-time audio generation. + + This client provides streaming TTS capabilities including: + - Chunked text processing for large documents + - Real-time audio streaming via WebSocket/SSE + - Incremental audio delivery as chunks are synthesized + - Support for both file and text input streaming + + Args: + auth: Authentication instance + api_key: Speechmatics API key (if auth not provided) + url: TTS API endpoint URL + conn_config: Connection configuration + + Examples: + Basic streaming: + >>> async with StreamingTTSClient(api_key="key") as client: + ... async for audio_chunk in client.stream_speech("Long text..."): + ... # Play audio_chunk in real-time + ... await play_audio(audio_chunk) + + File streaming: + >>> async for audio_chunk in client.stream_from_file("book.txt"): + ... 
await play_audio_chunk(audio_chunk)... wait
await play_audio_chunk(audio_chunk) + """ + chunks = self._chunk_text(text) + self._logger.info(f"Processing {len(chunks)} text chunks") + + ws = await self._connect_websocket() + + try: + # Send synthesis configuration + config = { + "type": "start_synthesis", + "voice": voice, + "output_format": output_format, + "sample_rate": sample_rate, + "speed": speed, + } + await ws.send_str(json.dumps({k: v for k, v in config.items() if v is not None})) + + # Process chunks concurrently for better performance + chunk_tasks = [] + for i, chunk in enumerate(chunks): + task = asyncio.create_task(self._process_chunk(ws, chunk, i)) + chunk_tasks.append(task) + + # Yield audio chunks as they become available + async for audio_chunk in self._collect_audio_chunks(ws, len(chunks)): + yield audio_chunk + + finally: + # Send end signal + await ws.send_str(json.dumps({"type": "end_synthesis"})) + + async def _process_chunk(self, ws: aiohttp.ClientWebSocketResponse, chunk: str, chunk_id: int) -> None: + """Process a single text chunk.""" + message = { + "type": "text_chunk", + "chunk_id": chunk_id, + "text": chunk, + } + await ws.send_str(json.dumps(message)) + + async def _collect_audio_chunks( + self, + ws: aiohttp.ClientWebSocketResponse, + expected_chunks: int + ) -> AsyncGenerator[bytes, None]: + """Collect and yield audio chunks from WebSocket.""" + received_chunks = 0 + + async for msg in ws: + if msg.type == aiohttp.WSMsgType.TEXT: + data = json.loads(msg.data) + + if data.get("type") == "audio_chunk": + # Decode base64 audio data + import base64 + audio_data = base64.b64decode(data["audio"]) + yield audio_data + + elif data.get("type") == "chunk_complete": + received_chunks += 1 + if received_chunks >= expected_chunks: + break + + elif data.get("type") == "error": + raise TransportError(f"Synthesis error: {data.get('message')}") + + elif msg.type == aiohttp.WSMsgType.ERROR: + raise TransportError(f"WebSocket error: {ws.exception()}") + + async def stream_from_file( + self, + file_path: Union[str, Path], + *, + voice: Optional[str] = None, + output_format: str = "wav", + sample_rate: Optional[int] = None, + speed: Optional[float] = None, + ) -> AsyncGenerator[bytes, None]: + """ + Stream TTS from a text file with real-time processing. + + Args: + file_path: Path to text file + voice: Voice ID for synthesis + output_format: Audio format + sample_rate: Audio sample rate + speed: Speech speed multiplier + + Yields: + Audio chunks as they're generated from file content + + Examples: + >>> async for audio_chunk in client.stream_from_file("book.txt"): + ... await save_audio_chunk(audio_chunk) + """ + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + # Read file content asynchronously + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + text_content = await f.read() + + # Stream the file content + async for audio_chunk in self.stream_speech( + text_content, + voice=voice, + output_format=output_format, + sample_rate=sample_rate, + speed=speed, + ): + yield audio_chunk + + async def stream_from_file_incremental( + self, + file_path: Union[str, Path], + *, + voice: Optional[str] = None, + output_format: str = "wav", + sample_rate: Optional[int] = None, + speed: Optional[float] = None, + read_chunk_size: int = 8192, + ) -> AsyncGenerator[bytes, None]: + """ + Stream TTS from file with incremental reading for very large files. 
+ + This method reads the file in chunks and processes them as they're read, + providing true streaming for massive text files. + + Args: + file_path: Path to text file + read_chunk_size: Size of file read chunks in bytes + + Yields: + Audio chunks as file is read and processed + """ + file_path = Path(file_path) + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + ws = await self._connect_websocket() + + try: + # Send synthesis configuration + config = { + "type": "start_synthesis", + "voice": voice, + "output_format": output_format, + "sample_rate": sample_rate, + "speed": speed, + } + await ws.send_str(json.dumps({k: v for k, v in config.items() if v is not None})) + + # Read and process file incrementally + async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: + buffer = "" + chunk_id = 0 + + while True: + # Read file chunk + file_chunk = await f.read(read_chunk_size) + if not file_chunk: + break + + buffer += file_chunk + + # Process complete sentences from buffer + while len(buffer) > self._chunk_size: + # Find sentence boundary + sentence_end = buffer.rfind('.', 0, self._chunk_size) + if sentence_end == -1: + sentence_end = buffer.rfind(' ', 0, self._chunk_size) + if sentence_end == -1: + sentence_end = self._chunk_size + + text_chunk = buffer[:sentence_end + 1] + buffer = buffer[sentence_end + 1:] + + # Send chunk for processing + await self._process_chunk(ws, text_chunk, chunk_id) + chunk_id += 1 + + # Yield any available audio + try: + msg = await asyncio.wait_for(ws.receive(), timeout=0.1) + if msg.type == aiohttp.WSMsgType.TEXT: + data = json.loads(msg.data) + if data.get("type") == "audio_chunk": + import base64 + audio_data = base64.b64decode(data["audio"]) + yield audio_data + except asyncio.TimeoutError: + pass # No audio ready yet + + # Process remaining buffer + if buffer.strip(): + await self._process_chunk(ws, buffer, chunk_id) + + # Collect remaining audio chunks + await ws.send_str(json.dumps({"type": "end_synthesis"})) + + async for msg in ws: + if msg.type == aiohttp.WSMsgType.TEXT: + data = json.loads(msg.data) + if data.get("type") == "audio_chunk": + import base64 + audio_data = base64.b64decode(data["audio"]) + yield audio_data + elif data.get("type") == "synthesis_complete": + break + + finally: + if self._websocket: + await self._websocket.close() + self._websocket = None diff --git a/sdk/TTS/speechmatics/tts/_transport.py b/sdk/TTS/speechmatics/tts/_transport.py new file mode 100644 index 0000000..f522fb4 --- /dev/null +++ b/sdk/TTS/speechmatics/tts/_transport.py @@ -0,0 +1,306 @@ +""" +Transport layer for Speechmatics Batch HTTP communication. + +This module provides the Transport class that handles low-level HTTP +communication with the Speechmatics Batch API, including connection management, +request/response handling, and authentication. +""" + +from __future__ import annotations + +import asyncio +import io +import sys +import uuid +from typing import Any +from typing import Optional + +import aiohttp + +from ._auth import AuthBase +from ._exceptions import AuthenticationError +from ._exceptions import ConnectionError +from ._exceptions import TransportError +from ._helpers import get_version +from ._logging import get_logger +from ._models import ConnectionConfig + + +class Transport: + """ + HTTP transport layer for Speechmatics Batch API communication. 
+ + This class handles all low-level HTTP communication with the Speechmatics + Batch API, including connection management, request serialization, + authentication, and response handling. + + Args: + url: Base URL for the Speechmatics Batch API. + conn_config: Connection configuration including URL and timeouts. + auth: Authentication instance for handling credentials. + request_id: Optional unique identifier for request tracking. Generated + automatically if not provided. + + Attributes: + conn_config: The connection configuration object. + request_id: Unique identifier for this transport instance. + + Examples: + Basic usage: + >>> from ._auth import StaticKeyAuth + >>> conn_config = ConnectionConfig() + >>> auth = StaticKeyAuth("your-api-key") + >>> transport = Transport(conn_config, auth) + >>> response = await transport.get("/jobs") + >>> await transport.close() + """ + + def __init__( + self, + url: str, + conn_config: ConnectionConfig, + auth: AuthBase, + request_id: Optional[str] = None, + ) -> None: + """ + Initialize the transport with connection configuration. + + Args: + conn_config: Connection configuration object containing connection parameters. + auth: Authentication instance for handling credentials. + request_id: Optional unique identifier for request tracking. + Generated automatically if not provided. + """ + self._url = url + self._conn_config = conn_config + self._auth = auth + self._request_id = request_id or str(uuid.uuid4()) + self._session: Optional[aiohttp.ClientSession] = None + self._closed = False + self._logger = get_logger(__name__) + + self._logger.debug("Transport initialized (request_id=%s, url=%s)", self._request_id, self._url) + + async def __aenter__(self) -> Transport: + """Async context manager entry.""" + await self._ensure_session() + return self + + async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + """Async context manager exit with automatic cleanup.""" + await self.close() + + + async def post( + self, + path: str, + json_data: Optional[dict[str, Any]] = None, + multipart_data: Optional[dict[str, Any]] = None, + timeout: Optional[float] = None, + ) -> aiohttp.ClientResponse: + """ + Send POST request to the API. + + Args: + path: API endpoint path + json_data: Optional JSON data for request body + multipart_data: Optional multipart form data + timeout: Optional request timeout + + Returns: + HTTP response object + + Raises: + AuthenticationError: If authentication fails + TransportError: If request fails + """ + return await self._request("POST", path, json_data=json_data, multipart_data=multipart_data, timeout=timeout) + + + + async def close(self) -> None: + """ + Close the HTTP session and cleanup resources. + + This method gracefully closes the HTTP session and marks the + transport as closed. It's safe to call multiple times. + """ + if self._session: + try: + await self._session.close() + except Exception: + pass # Best effort cleanup + finally: + self._session = None + self._closed = True + + @property + def is_connected(self) -> bool: + """ + Check if the transport has an active session. 
+ + Returns: + True if session is active, False otherwise + """ + return self._session is not None and not self._closed + + async def _ensure_session(self) -> None: + """Ensure HTTP session is created.""" + if self._session is None and not self._closed: + self._logger.debug( + "Creating HTTP session (connect_timeout=%.1fs, operation_timeout=%.1fs)", + self._conn_config.connect_timeout, + self._conn_config.operation_timeout, + ) + timeout = aiohttp.ClientTimeout( + total=self._conn_config.operation_timeout, + connect=self._conn_config.connect_timeout, + ) + self._session = aiohttp.ClientSession(timeout=timeout) + + async def _request( + self, + method: str, + path: str, + params: Optional[dict[str, Any]] = None, + json_data: Optional[dict[str, Any]] = None, + multipart_data: Optional[dict[str, Any]] = None, + timeout: Optional[float] = None, + ) -> aiohttp.ClientResponse: + """ + Send HTTP request to the API. + + Args: + method: HTTP method (GET, POST, DELETE) + path: API endpoint path + params: Optional query parameters + json_data: Optional JSON data for request body + multipart_data: Optional multipart form data + timeout: Optional request timeout + + Returns: + HTTP response object + + Raises: + AuthenticationError: If authentication fails + ConnectionError: If connection fails + TransportError: For other transport errors + """ + await self._ensure_session() + + if self._session is None: + raise ConnectionError("Failed to create HTTP session") + + url = f"{self._url.rstrip('/')}{path}" + headers = await self._prepare_headers() + + self._logger.debug( + "Sending HTTP request %s %s (json=%s, multipart=%s)", + method, + url, + json_data is not None, + multipart_data is not None, + ) + + # Override timeout if specified + if timeout: + request_timeout = aiohttp.ClientTimeout(total=timeout) + else: + request_timeout = None + + try: + # Prepare request arguments + kwargs: dict[str, Any] = { + "headers": headers, + "params": params, + "timeout": request_timeout, + } + + if json_data: + kwargs["json"] = json_data + elif multipart_data: + # Force multipart encoding even when no files are present (for fetch_data support) + form_data = aiohttp.FormData(default_to_multipart=True) + for key, value in multipart_data.items(): + if isinstance(value, tuple) and len(value) == 3: + # File data: (filename, file_data, content_type) + filename, file_data, content_type = value + # aiohttp cannot serialize io.BytesIO directly; convert to bytes + if isinstance(file_data, io.BytesIO): + file_payload = file_data.getvalue() + else: + file_payload = file_data + form_data.add_field(key, file_payload, filename=filename, content_type=content_type) + else: + # Regular form field + if isinstance(value, dict): + import json + + value = json.dumps(value) + form_data.add_field(key, value) + kwargs["data"] = form_data + + async with self._session.request(method, url, **kwargs) as response: + return await self._handle_response(response) + + except asyncio.TimeoutError: + self._logger.error( + "Request timeout %s %s (timeout=%.1fs)", method, path, self._conn_config.operation_timeout + ) + raise TransportError(f"Request timeout for {method} {path}") from None + except aiohttp.ClientError as e: + self._logger.error("Request failed %s %s: %s", method, path, e) + raise ConnectionError(f"Request failed: {e}") from e + except Exception as e: + self._logger.error("Unexpected error %s %s: %s", method, path, e) + raise TransportError(f"Unexpected error: {e}") from e + + async def _prepare_headers(self) -> dict[str, str]: + """ + 
Prepare HTTP headers for requests. + + Returns: + Headers dictionary with authentication and tracking info + """ + auth_headers = await self._auth.get_auth_headers() + auth_headers["User-Agent"] = ( + f"speechmatics-batch-v{get_version()} python/{sys.version_info.major}.{sys.version_info.minor}" + ) + + if self._request_id: + auth_headers["X-Request-Id"] = self._request_id + + return auth_headers + + async def _handle_response(self, response: aiohttp.ClientResponse) -> aiohttp.ClientResponse: + """ + Handle HTTP response and extract JSON data. + + Args: + response: HTTP response object + + Returns: + HTTP response object + + Raises: + AuthenticationError: For 401/403 responses + TransportError: For other error responses + """ + try: + if response.status == 401: + raise AuthenticationError("Invalid API key - authentication failed") + elif response.status == 403: + raise AuthenticationError("Access forbidden - check API key permissions") + elif response.status >= 400: + error_text = await response.text() + self._logger.error("HTTP error %d %s: %s", response.status, response.reason, error_text) + raise TransportError(f"HTTP {response.status}: {response.reason} - {error_text}") + return response + + + except aiohttp.ContentTypeError as e: + self._logger.error("Failed to parse JSON response: %s", e) + raise TransportError(f"Failed to parse response: {e}") from e + except Exception as e: + self._logger.error("Error handling response: %s", e) + raise TransportError(f"Error handling response: {e}") from e From 9cf67cd47830cbf6e20ec72c12d094b7ba76a0ba Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Tue, 21 Oct 2025 12:29:07 +0100 Subject: [PATCH 2/3] some cleanup --- examples/tts/realtime_parsing_example.py | 307 ---------------------- examples/tts/streaming_example.py | 243 ----------------- sdk/TTS/speechmatics/tts/_async_client.py | 82 +----- sdk/TTS/speechmatics/tts/_models.py | 32 ++- 4 files changed, 38 insertions(+), 626 deletions(-) delete mode 100644 examples/tts/realtime_parsing_example.py delete mode 100644 examples/tts/streaming_example.py diff --git a/examples/tts/realtime_parsing_example.py b/examples/tts/realtime_parsing_example.py deleted file mode 100644 index 644aff9..0000000 --- a/examples/tts/realtime_parsing_example.py +++ /dev/null @@ -1,307 +0,0 @@ -""" -Real-time streaming TTS with live text parsing examples. - -This demonstrates true real-time TTS where audio generation happens -simultaneously with text parsing - no waiting for complete text processing! -""" - -import asyncio -import time -from pathlib import Path -from typing import AsyncIterator - -# Assuming the real-time streaming client is available -from speechmatics.tts import RealTimeStreamingTTSClient - - -async def live_file_parsing_example(): - """Demonstrate real-time file parsing with immediate audio synthesis.""" - print("šŸ“– Real-time File Parsing Example") - print("Audio starts playing while file is still being read!") - - # Create a sample file - sample_text = """ - Welcome to real-time streaming text-to-speech synthesis. - - This is paragraph one. As you can hear, the audio begins immediately, - even though the system is still reading and parsing the rest of this document. - - This is paragraph two. The synthesis happens in real-time as each sentence - is parsed from the file. There's no waiting for the entire document to load. - - This is paragraph three. The system intelligently finds sentence boundaries - and starts synthesis as soon as it has enough text to work with. - - This is the final paragraph. 
By the time you hear this, the system has - been continuously reading, parsing, and synthesizing throughout the entire process. - """ - - sample_file = Path("realtime_sample.txt") - sample_file.write_text(sample_text) - - try: - async with RealTimeStreamingTTSClient( - api_key="your-api-key", - synthesis_threshold=50, # Start synthesis after 50 characters - max_parse_delay=0.05, # Very responsive parsing - ) as client: - - print("šŸŽµ Starting real-time parsing and synthesis...") - start_time = time.time() - chunk_count = 0 - - async for audio_chunk in client.stream_live_from_file( - sample_file, - voice="en-US-neural-1", - read_delay=0.03, # Simulate fast reading - ): - chunk_count += 1 - elapsed = time.time() - start_time - - print(f"šŸ”Š Audio chunk {chunk_count} at {elapsed:.2f}s: {len(audio_chunk)} bytes") - - # In real application: await play_audio_immediately(audio_chunk) - # Simulate audio playback time - await asyncio.sleep(0.1) - - total_time = time.time() - start_time - print(f"āœ… Real-time processing complete in {total_time:.2f}s with {chunk_count} chunks") - - finally: - if sample_file.exists(): - sample_file.unlink() - - -async def live_typing_simulation(): - """Simulate user typing with immediate TTS synthesis.""" - print("\nāŒØļø Live Typing Simulation") - print("Audio generated as text is typed in real-time!") - - async def simulate_typing() -> AsyncIterator[str]: - """Simulate a user typing text in real-time.""" - text = "Hello there! I am typing this text in real time. Each word appears as I type it. The speech synthesis happens immediately as I finish each sentence. This is truly real-time text-to-speech!" - - words = text.split() - for i, word in enumerate(words): - yield word + " " - - # Simulate typing speed (faster for short words, slower for long ones) - typing_delay = 0.2 + (len(word) * 0.05) - await asyncio.sleep(typing_delay) - - # Add extra pause after sentences - if word.endswith(('.', '!', '?')): - await asyncio.sleep(0.3) - - async with RealTimeStreamingTTSClient( - api_key="your-api-key", - synthesis_threshold=30, # Quick synthesis trigger - ) as client: - - print("āŒØļø Starting typing simulation...") - start_time = time.time() - - async for audio_chunk in client.stream_live_text_input( - simulate_typing(), - voice="en-US-neural-2", - speed=1.1, - ): - elapsed = time.time() - start_time - print(f"šŸŽµ Live audio at {elapsed:.2f}s: {len(audio_chunk)} bytes") - - # In real app: await play_audio_immediately(audio_chunk) - - print("āœ… Live typing synthesis complete!") - - -async def live_monitoring_example(): - """Demonstrate real-time monitoring with performance metrics.""" - print("\nšŸ“Š Real-time Monitoring Example") - print("Shows performance metrics during live processing!") - - # Create a longer sample for monitoring - long_text = """ - Real-time Performance Monitoring Demo - - """ + "\n\n".join([ - f"Section {i}: This section demonstrates real-time performance monitoring " - f"during live text-to-speech synthesis. The system tracks processing rates, " - f"audio generation speed, buffer health, and latency metrics in real-time." 
- for i in range(1, 8) - ]) - - monitor_file = Path("monitoring_sample.txt") - monitor_file.write_text(long_text) - - try: - async with RealTimeStreamingTTSClient(api_key="your-api-key") as client: - - print("šŸ“Š Starting monitored real-time synthesis...") - - async for result in client.stream_with_live_monitoring( - monitor_file, - voice="en-US-neural-3", - monitor_interval=0.1, - ): - audio_chunk = result["audio_chunk"] - metrics = result["metrics"] - status = result["status"] - - # Display real-time metrics - print(f"šŸŽµ Chunk {metrics['chunk_id']}: " - f"{len(audio_chunk)} bytes | " - f"Rate: {metrics['chars_per_second']:.1f} chars/s | " - f"Latency: {metrics['elapsed_time']:.2f}s | " - f"RT Ratio: {status['real_time_ratio']:.2f}") - - # In real app: await play_audio_with_monitoring(audio_chunk, metrics) - - print("āœ… Monitored synthesis complete!") - - finally: - if monitor_file.exists(): - monitor_file.unlink() - - -async def streaming_vs_batch_comparison(): - """Compare real-time streaming vs traditional batch processing.""" - print("\n⚔ Streaming vs Batch Comparison") - - test_text = """ - This is a comparison test between real-time streaming synthesis - and traditional batch processing. In batch mode, you would wait - for this entire text to be processed before hearing any audio. - In streaming mode, you hear audio immediately as text is parsed. - """ - - test_file = Path("comparison_test.txt") - test_file.write_text(test_text) - - try: - print("🐌 Simulating BATCH processing (traditional approach):") - batch_start = time.time() - - # Simulate batch processing delay - print(" ā³ Loading entire file...") - await asyncio.sleep(1.0) - print(" ā³ Processing all text...") - await asyncio.sleep(2.0) - print(" ā³ Synthesizing complete audio...") - await asyncio.sleep(3.0) - - batch_time = time.time() - batch_start - print(f" āœ… Batch complete after {batch_time:.1f}s - NOW audio starts playing") - - print("\nšŸš€ Real-time STREAMING processing:") - stream_start = time.time() - - async with RealTimeStreamingTTSClient(api_key="your-api-key") as client: - first_audio = True - - async for audio_chunk in client.stream_live_from_file( - test_file, - read_delay=0.02, # Fast parsing - ): - if first_audio: - first_audio_time = time.time() - stream_start - print(f" šŸŽµ FIRST audio at {first_audio_time:.2f}s - Audio playing while still parsing!") - first_audio = False - - # Continue processing... - - stream_total = time.time() - stream_start - print(f" āœ… Streaming complete in {stream_total:.1f}s total") - - print(f"\nšŸ“ˆ Results:") - print(f" Batch: {batch_time:.1f}s wait before ANY audio") - print(f" Streaming: {first_audio_time:.2f}s to FIRST audio ({batch_time/first_audio_time:.1f}x faster!)") - - finally: - if test_file.exists(): - test_file.unlink() - - -async def live_document_reader(): - """Simulate a live document reader with real-time TTS.""" - print("\nšŸ“š Live Document Reader Simulation") - print("Simulates reading a document with real-time speech synthesis!") - - # Simulate a document being written/received in real-time - async def live_document_stream() -> AsyncIterator[str]: - """Simulate a document being written or received live.""" - sentences = [ - "Breaking news: Real-time text-to-speech technology has reached new heights. ", - "Scientists have developed a system that can synthesize speech as text is being written. ", - "This breakthrough enables immediate audio feedback for live content creation. 
", - "Applications include live transcription, real-time translation, and accessibility tools. ", - "The system processes text incrementally, starting synthesis before complete sentences are finished. ", - "This represents a significant advancement in human-computer interaction. ", - "Users can now hear their text being spoken as they type or as content arrives. ", - "The technology promises to revolutionize how we interact with text-based systems. " - ] - - for sentence in sentences: - # Simulate sentence arriving word by word - words = sentence.split() - for word in words: - yield word + " " - await asyncio.sleep(0.15) # Simulate writing/receiving speed - - # Pause between sentences - await asyncio.sleep(0.5) - - async with RealTimeStreamingTTSClient( - api_key="your-api-key", - synthesis_threshold=40, # Start synthesis quickly - ) as client: - - print("šŸ“° Starting live document reader...") - - async for audio_chunk in client.stream_live_text_input( - live_document_stream(), - voice="en-US-neural-1", - output_format="wav", - ): - print(f"šŸ”Š Live audio: {len(audio_chunk)} bytes") - # In real app: await play_live_audio(audio_chunk) - - print("āœ… Live document reading complete!") - - -async def main(): - """Run all real-time streaming examples.""" - print("šŸš€ Real-time Streaming TTS Examples") - print("=" * 60) - print("šŸŽÆ Audio synthesis happens WHILE text is being parsed!") - print("=" * 60) - - examples = [ - live_file_parsing_example, - live_typing_simulation, - live_monitoring_example, - streaming_vs_batch_comparison, - live_document_reader, - ] - - for example in examples: - try: - await example() - except Exception as e: - print(f"āŒ Example failed: {e}") - - print("\n" + "-" * 40 + "\n") - await asyncio.sleep(1) - - print("šŸŽ‰ All real-time streaming examples completed!") - print("\nšŸ’” Key Benefits Demonstrated:") - print(" • Audio starts immediately (no waiting for complete parsing)") - print(" • Continuous synthesis during text processing") - print(" • Real-time performance monitoring") - print(" • Live input processing (typing, streaming content)") - print(" • Intelligent boundary detection for optimal synthesis") - - -if __name__ == "__main__": - print("āš ļø Remember to set your actual Speechmatics API key!") - print("šŸŽ§ In real usage, replace print statements with actual audio playback!") - asyncio.run(main()) diff --git a/examples/tts/streaming_example.py b/examples/tts/streaming_example.py deleted file mode 100644 index 9ab9cd1..0000000 --- a/examples/tts/streaming_example.py +++ /dev/null @@ -1,243 +0,0 @@ -""" -Example usage of the Streaming TTS Client. - -This example demonstrates real-time text-to-speech streaming with chunked processing, -WebSocket communication, and incremental audio delivery. -""" - -import asyncio -import io -from pathlib import Path - -# Assuming the streaming client is available -from speechmatics.tts import StreamingTTSClient - - -async def basic_streaming_example(): - """Basic streaming TTS example.""" - print("šŸŽµ Basic Streaming TTS Example") - - text = """ - This is a demonstration of streaming text-to-speech synthesis. - The text is being processed in real-time, chunk by chunk. - You should hear audio being generated as each segment is processed. - This enables immediate playback without waiting for the entire text to be synthesized. 
- """ - - async with StreamingTTSClient(api_key="your-api-key") as client: - print("šŸ“ Starting streaming synthesis...") - - audio_chunks = [] - async for audio_chunk in client.stream_speech( - text, - voice="en-US-neural-1", - output_format="wav", - sample_rate=22050 - ): - print(f"šŸ”Š Received audio chunk: {len(audio_chunk)} bytes") - audio_chunks.append(audio_chunk) - - # In a real application, you would play this chunk immediately - # await play_audio_chunk(audio_chunk) - - print(f"āœ… Streaming complete! Total chunks: {len(audio_chunks)}") - - # Save complete audio - with open("streaming_output.wav", "wb") as f: - for chunk in audio_chunks: - f.write(chunk) - - -async def file_streaming_example(): - """Stream TTS from a text file.""" - print("\nšŸ“„ File Streaming TTS Example") - - # Create a sample text file - sample_text = """ - Chapter 1: The Beginning - - In the realm of streaming text-to-speech, we embark on a journey of real-time audio generation. - Each sentence flows seamlessly into the next, creating a continuous stream of synthesized speech. - - The technology behind this process involves breaking down large texts into manageable chunks, - processing them through advanced neural networks, and delivering audio incrementally. - - This approach enables immediate playback, reduced memory usage, and better user experience - for applications dealing with long-form content like audiobooks, articles, and documents. - """ - - # Write sample file - sample_file = Path("sample_text.txt") - sample_file.write_text(sample_text) - - try: - async with StreamingTTSClient(api_key="your-api-key") as client: - print(f"šŸ“– Streaming from file: {sample_file}") - - chunk_count = 0 - async for audio_chunk in client.stream_from_file( - sample_file, - voice="en-US-neural-2", - output_format="mp3", - speed=1.1 - ): - chunk_count += 1 - print(f"šŸŽµ File chunk {chunk_count}: {len(audio_chunk)} bytes") - - # In real usage: await play_audio_chunk(audio_chunk) - - print(f"āœ… File streaming complete! Processed {chunk_count} chunks") - - finally: - # Cleanup - if sample_file.exists(): - sample_file.unlink() - - -async def incremental_file_streaming_example(): - """Stream very large files with incremental reading.""" - print("\nšŸ“š Incremental File Streaming Example") - - # Create a larger sample file - large_text = """ - The Art of Streaming Text-to-Speech - - """ + "\n\n".join([ - f"Paragraph {i}: This is a demonstration of incremental file processing. " - f"The system reads the file in chunks and processes them as they become available. " - f"This approach is particularly useful for very large documents, books, or articles " - f"where loading the entire content into memory might not be feasible." - for i in range(1, 21) - ]) - - large_file = Path("large_sample.txt") - large_file.write_text(large_text) - - try: - async with StreamingTTSClient( - api_key="your-api-key", - chunk_size=300, # Smaller chunks for demo - overlap_size=30 - ) as client: - print(f"šŸ“– Incremental streaming from: {large_file}") - - chunk_count = 0 - total_bytes = 0 - - async for audio_chunk in client.stream_from_file_incremental( - large_file, - voice="en-US-neural-3", - output_format="wav", - read_chunk_size=1024 # Read 1KB at a time - ): - chunk_count += 1 - total_bytes += len(audio_chunk) - print(f"šŸŽµ Incremental chunk {chunk_count}: {len(audio_chunk)} bytes " - f"(Total: {total_bytes} bytes)") - - # Simulate real-time playback delay - await asyncio.sleep(0.1) - - print(f"āœ… Incremental streaming complete! 
" - f"Chunks: {chunk_count}, Total audio: {total_bytes} bytes") - - finally: - # Cleanup - if large_file.exists(): - large_file.unlink() - - -async def real_time_playback_simulation(): - """Simulate real-time audio playback with streaming.""" - print("\nšŸŽ® Real-time Playback Simulation") - - text = """ - Welcome to the real-time streaming demonstration. - This example simulates how you would integrate streaming TTS - with an audio playback system for immediate user experience. - """ - - class AudioPlayer: - """Simulated audio player.""" - - def __init__(self): - self.buffer = io.BytesIO() - self.playing = False - - async def add_chunk(self, audio_chunk: bytes): - """Add audio chunk to playback buffer.""" - self.buffer.write(audio_chunk) - if not self.playing: - await self.start_playback() - - async def start_playback(self): - """Start audio playback (simulated).""" - self.playing = True - print("šŸ”Š Audio playback started...") - - # Simulate playback time - await asyncio.sleep(1.0) - - print("šŸ”‡ Audio playback finished") - self.playing = False - - player = AudioPlayer() - - async with StreamingTTSClient(api_key="your-api-key") as client: - print("šŸŽµ Starting real-time synthesis and playback...") - - # Process streaming TTS and playback concurrently - async for audio_chunk in client.stream_speech( - text, - voice="en-US-neural-1", - output_format="wav" - ): - print(f"šŸ“¦ Received chunk: {len(audio_chunk)} bytes") - await player.add_chunk(audio_chunk) - - print("āœ… Real-time streaming and playback complete!") - - -async def error_handling_example(): - """Demonstrate error handling in streaming TTS.""" - print("\nāš ļø Error Handling Example") - - async with StreamingTTSClient(api_key="invalid-key") as client: - try: - async for audio_chunk in client.stream_speech("Test text"): - print(f"Chunk: {len(audio_chunk)} bytes") - - except Exception as e: - print(f"āŒ Error occurred: {type(e).__name__}: {e}") - print("šŸ”§ In production, implement proper error recovery") - - -async def main(): - """Run all streaming TTS examples.""" - print("šŸš€ Speechmatics Streaming TTS Examples") - print("=" * 50) - - examples = [ - basic_streaming_example, - file_streaming_example, - incremental_file_streaming_example, - real_time_playback_simulation, - error_handling_example, - ] - - for example in examples: - try: - await example() - except Exception as e: - print(f"āŒ Example failed: {e}") - - print("\n" + "-" * 30 + "\n") - await asyncio.sleep(1) # Brief pause between examples - - print("šŸŽ‰ All examples completed!") - - -if __name__ == "__main__": - # Note: Replace "your-api-key" with actual Speechmatics API key - print("āš ļø Remember to set your actual Speechmatics API key!") - asyncio.run(main()) diff --git a/sdk/TTS/speechmatics/tts/_async_client.py b/sdk/TTS/speechmatics/tts/_async_client.py index 46b8106..d887214 100644 --- a/sdk/TTS/speechmatics/tts/_async_client.py +++ b/sdk/TTS/speechmatics/tts/_async_client.py @@ -10,7 +10,7 @@ import asyncio import os import uuid -from typing import Any +from typing import Any, AsyncGenerator from typing import BinaryIO from typing import Optional from typing import Union @@ -22,7 +22,7 @@ from ._exceptions import AuthenticationError from ._exceptions import TimeoutError from ._logging import get_logger -from ._models import ConnectionConfig +from ._models import ConnectionConfig, OutputFormat, Voice from ._transport import Transport @@ -112,14 +112,12 @@ async def __aenter__(self) -> AsyncClient: """ return self - async def synthesize_speech( + 
async def generate( self, - text: str, *, - voice: Optional[str] = None, - output_format: str = "wav", - sample_rate: Optional[int] = None, - speed: Optional[float] = None, + text: str = "", + voice: Voice = Voice.SARAH, + output_format: OutputFormat = OutputFormat.RAW_PCM_16000, ) -> aiohttp.ClientResponse: """ Convert text to speech audio. @@ -128,8 +126,6 @@ async def synthesize_speech( text: Text to convert to speech. voice: Voice ID to use for synthesis (e.g., "en-US-neural-1"). output_format: Audio format ("wav", "mp3", "ogg"). - sample_rate: Audio sample rate in Hz (e.g., 22050, 44100). - speed: Speech speed multiplier (0.5 to 2.0). Returns: Audio data as bytes. @@ -139,7 +135,7 @@ async def synthesize_speech( TransportError: If synthesis fails. Examples: - >>> response = await client.synthesize_speech("Hello world") + >>> response = await client.generate("Hello world") >>> audio_data = await response.read() >>> with open("output.wav", "wb") as f: ... f.write(audio_data) @@ -147,71 +143,11 @@ async def synthesize_speech( # Prepare synthesis request request_data = { "text": text, - "output_format": output_format, } - - if voice: - request_data["voice"] = voice - if sample_rate: - request_data["sample_rate"] = str(sample_rate) - if speed: - request_data["speed"] = str(speed) - - response = await self._transport.post("/synthesize", json_data=request_data) - return response - - async def synthesize_from_file( - self, - file_path: Union[str, os.PathLike], - *, - voice: Optional[str] = None, - output_format: str = "wav", - sample_rate: Optional[int] = None, - speed: Optional[float] = None, - ) -> aiohttp.ClientResponse: - """ - Convert text from a file to speech audio. - - Args: - file_path: Path to text or SSML file. - voice: Voice ID to use for synthesis. - output_format: Audio format ("wav", "mp3", "ogg"). - sample_rate: Audio sample rate in Hz. - speed: Speech speed multiplier (0.5 to 2.0). - - Returns: - Raw aiohttp ClientResponse object. - Raises: - FileNotFoundError: If file doesn't exist. - AuthenticationError: If API key is invalid. - TransportError: If synthesis fails. + response = await self._transport.post(f"/generate/{voice}?output_format={output_format}", json_data=request_data) + return response - Examples: - >>> response = await client.synthesize_from_file("script.txt") - >>> audio_data = await response.read() - >>> with open("output.wav", "wb") as f: - ... f.write(audio_data) - """ - import aiofiles - from pathlib import Path - - file_path_obj = Path(file_path) - if not file_path_obj.exists(): - raise FileNotFoundError(f"File not found: {file_path}") - - # Read text content - async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: - text_content = await f.read() - - return await self.synthesize_speech( - text_content, - voice=voice, - output_format=output_format, - sample_rate=sample_rate, - speed=speed, - ) - async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: """ Async context manager exit with automatic cleanup. 
diff --git a/sdk/TTS/speechmatics/tts/_models.py b/sdk/TTS/speechmatics/tts/_models.py index 71c6bcb..7162913 100644 --- a/sdk/TTS/speechmatics/tts/_models.py +++ b/sdk/TTS/speechmatics/tts/_models.py @@ -9,12 +9,10 @@ from __future__ import annotations +from enum import Enum from dataclasses import dataclass - - - @dataclass class ConnectionConfig: """ @@ -29,3 +27,31 @@ class ConnectionConfig: connect_timeout: float = 30.0 operation_timeout: float = 300.0 + + +class OutputFormat(str, Enum): + """ + Output format for the generated audio. + + Attributes: + wav_16000: WAV audio format with 16kHz sample rate. + raw_pcm_16000: Raw audio format with 16kHz sample rate. + """ + + WAV_16000 = "wav_16000" + RAW_PCM_16000 = "pcm_16000" + + +class Voice(str, Enum): + """ + Voice ID for the generated audio. + + Attributes: + sarah: English (UK) female voice. + theo: English (UK) male voice. + megan: English (UK) female voice. + """ + + SARAH = "sarah" + THEO = "theo" + MEGAN = "megan" From c9c150d90cef0e0a938ced4e3d4073162eafb8ee Mon Sep 17 00:00:00 2001 From: Lucy Gavin Date: Tue, 21 Oct 2025 15:00:20 +0100 Subject: [PATCH 3/3] async client test --- Makefile | 45 +++++++++++----- sdk/TTS/MIGRATION.md | 64 ----------------------- sdk/TTS/speechmatics/__init__.py | 2 + sdk/TTS/speechmatics/tts/__init__.py | 34 ++---------- sdk/TTS/speechmatics/tts/_async_client.py | 6 +-- tests/tts/async_http_test.py | 9 ++++ 6 files changed, 51 insertions(+), 109 deletions(-) delete mode 100644 sdk/TTS/MIGRATION.md create mode 100644 tests/tts/async_http_test.py diff --git a/Makefile b/Makefile index 575f351..85a24ad 100644 --- a/Makefile +++ b/Makefile @@ -1,12 +1,12 @@ # Makefile for Speechmatics Python SDKs .PHONY: help -.PHONY: test-all test-rt test-batch test-flow -.PHONY: format-all format-rt format-batch format-flow -.PHONY: lint-all lint-rt lint-batch lint-flow -.PHONY: type-check-all type-check-rt type-check-batch type-check-flow -.PHONY: build-all build-rt build-batch build-flow -.PHONY: clean-all clean-rt clean-batch clean-flow clean-flow +.PHONY: test-all test-rt test-batch test-flow test-tts +.PHONY: format-all format-rt format-batch format-flow format-tts +.PHONY: lint-all lint-rt lint-batch lint-flow lint-tts +.PHONY: type-check-all type-check-rt type-check-batch type-check-flow type-check-tts +.PHONY: build-all build-rt build-batch build-flow build-tts +.PHONY: clean-all clean-rt clean-batch clean-flow clean-tts help: @echo "Available commands:" @@ -49,7 +49,7 @@ help: @echo "" # Testing targets -test-all: test-rt test-batch test-flow +test-all: test-rt test-batch test-flow test-tts test-rt: pytest tests/rt/ -v @@ -60,8 +60,11 @@ test-batch: test-flow: pytest tests/flow/ -v +test-tts: + pytest tests/tts/ -v + # Formatting targets -format-all: format-rt format-batch format-flow +format-all: format-rt format-batch format-flow format-tts format-rt: cd sdk/rt/speechmatics && black . @@ -75,8 +78,12 @@ format-flow: cd sdk/flow/speechmatics && black . cd sdk/flow/speechmatics && ruff check --fix . +format-tts: + cd sdk/tts/speechmatics && black . + cd sdk/tts/speechmatics && ruff check --fix . + # Linting targets -lint-all: lint-rt lint-batch lint-flow +lint-all: lint-rt lint-batch lint-flow lint-tts lint-rt: cd sdk/rt/speechmatics && ruff check . @@ -87,8 +94,11 @@ lint-batch: lint-flow: cd sdk/flow/speechmatics && ruff check . +lint-tts: + cd sdk/tts/speechmatics && ruff check . 
+
 # Type checking targets
-type-check-all: type-check-rt type-check-batch type-check-flow
+type-check-all: type-check-rt type-check-batch type-check-flow type-check-tts

 type-check-rt:
 	cd sdk/rt/speechmatics && mypy .
@@ -99,18 +109,22 @@ type-check-batch:

 type-check-flow:
 	cd sdk/flow/speechmatics && mypy .

+type-check-tts:
+	cd sdk/TTS/speechmatics && mypy .
+
 # Installation targets
 install-dev:
 	python -m pip install --upgrade pip
 	python -m pip install -e sdk/rt[dev]
 	python -m pip install -e sdk/batch[dev]
 	python -m pip install -e sdk/flow[dev]
+	python -m pip install -e sdk/TTS[dev]

 install-build:
 	python -m pip install --upgrade build

 # Building targets
-build-all: build-rt build-batch build-flow
+build-all: build-rt build-batch build-flow build-tts

 build-rt: install-build
 	cd sdk/rt && python -m build
@@ -121,8 +135,11 @@ build-batch: install-build

 build-flow: install-build
 	cd sdk/flow && python -m build

+build-tts: install-build
+	cd sdk/TTS && python -m build
+
 # Cleaning targets
-clean-all: clean-rt clean-batch clean-flow
+clean-all: clean-rt clean-batch clean-flow clean-tts

 clean-rt:
 	rm -rf sdk/rt/dist sdk/rt/build sdk/rt/*.egg-info
@@ -135,3 +152,7 @@ clean-batch:

 clean-flow:
 	rm -rf sdk/flow/dist sdk/flow/build sdk/flow/*.egg-info
 	find sdk/flow -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
+
+clean-tts:
+	rm -rf sdk/TTS/dist sdk/TTS/build sdk/TTS/*.egg-info
+	find sdk/TTS -name __pycache__ -exec rm -rf {} + 2>/dev/null || true

diff --git a/sdk/TTS/MIGRATION.md b/sdk/TTS/MIGRATION.md
deleted file mode 100644
index abedf51..0000000
--- a/sdk/TTS/MIGRATION.md
+++ /dev/null
@@ -1,64 +0,0 @@
-# Batch SDK Migration Guide
-
-This guide helps users migrate from the legacy Speechmatics Batch Client (`speechmatics-python`) to the new Speechmatics Batch SDK (`speechmatics-batch`). The new SDK provides a modern async API, improved error handling, and enhanced authentication options with minimal dependencies.
-
-## Significant Changes
-
-- **Fully async API**: All operations now use async/await pattern with `AsyncClient`
-- **Enhanced authentication**: Support for both API key and JWT authentication methods
-- **Better error handling**: More specific exceptions and clearer error messages
-- **Lightweight package**: Minimal dependencies for faster installation and reduced conflicts
-- **Improved job management**: Better job status tracking and result handling
-- **Streamlined configuration**: Unified `JobConfig` for all job types
-- **URL and API key configuration**: Allows loading URL and API key from environment variables
-
-### Breaking Changes
-
-- **Import paths**: `speechmatics.batch_client` → `speechmatics.batch`
-- **Client class**: `BatchClient` → `AsyncClient`
-- **All methods**: Synchronous → Asynchronous (requires `await`)
-- **Configuration**: `BatchTranscriptionConfig` → `JobConfig` with `TranscriptionConfig`
-- **Job submission**: Direct config parameter → Structured `JobConfig` object
-- **Result format**: `transcription_format` parameter → `format_type` parameter
-- **Authentication**: API key parameter naming changed to `api_key`
-- **CLI not available**: CLI will be released as a separate package
-
-## Installation
-
-``` bash
-pip install speechmatics-batch
-```
-
-## Usage
-
-Before
-
-```python
-from speechmatics.models import ConnectionSettings, BatchTranscriptionConfig
-from speechmatics.batch_client import BatchClient
-
-with BatchClient("API-KEY") as client:
-    job_id = client.submit_job(PATH_TO_FILE, BatchTranscriptionConfig(LANGUAGE))
-
-    transcript = client.wait_for_completion(job_id, transcription_format='txt')
-
-    print(transcript)
-```
-
-After
-
-```python
-from speechmatics.batch import AsyncClient, FormatType, JobConfig, JobType, TranscriptionConfig
-
-async with AsyncClient(os.environ.get("SPEECHMATICS_API_KEY")) as client:
-    config = JobConfig(
-        type=JobType.TRANSCRIPTION,
-        transcription_config=TranscriptionConfig(language="en"),
-    )
-
-    job = await client.submit_job("audio.wav", config=config)
-
-    result = await client.wait_for_completion(job.id, format_type=FormatType.TXT)
-
-    print(f"Transcript: {result.transcript_text}")
-```

diff --git a/sdk/TTS/speechmatics/__init__.py b/sdk/TTS/speechmatics/__init__.py
index e69de29..edef4e7 100644
--- a/sdk/TTS/speechmatics/__init__.py
+++ b/sdk/TTS/speechmatics/__init__.py
@@ -0,0 +1,2 @@
+# Re-export the tts module contents for easier imports
+from .tts import *  # noqa: F403, F401

diff --git a/sdk/TTS/speechmatics/tts/__init__.py b/sdk/TTS/speechmatics/tts/__init__.py
index f488880..f069fdf 100644
--- a/sdk/TTS/speechmatics/tts/__init__.py
+++ b/sdk/TTS/speechmatics/tts/__init__.py
@@ -12,21 +12,8 @@
 from ._exceptions import TimeoutError
 from ._exceptions import TransportError
 from ._models import ConnectionConfig
-from ._models import FetchData
-from ._models import FormatType
-from ._models import JobConfig
-from ._models import JobDetails
-from ._models import JobInfo
-from ._models import JobStatus
-from ._models import JobType
-from ._models import NotificationConfig
-from ._models import NotificationContents
-from ._models import NotificationMethod
-from ._models import OperatingPoint
-from ._models import SummarizationConfig
-from ._models import Transcript
-from ._models import TranscriptionConfig
-from ._models import TranslationConfig
+from ._models import OutputFormat
+from ._models import Voice

 __all__ = [
     "AsyncClient",
@@ -40,20 +27,7 @@
     "BatchError",
     "JobError",
"TimeoutError", - "JobConfig", - "JobDetails", - "JobInfo", - "NotificationConfig", - "NotificationMethod", - "NotificationContents", - "OperatingPoint", - "SummarizationConfig", - "Transcript", - "TranscriptionConfig", - "TranslationConfig", "ConnectionConfig", - "JobStatus", - "JobType", - "FormatType", - "FetchData", + "Voice", + "OutputFormat", ] diff --git a/sdk/TTS/speechmatics/tts/_async_client.py b/sdk/TTS/speechmatics/tts/_async_client.py index d887214..77b8103 100644 --- a/sdk/TTS/speechmatics/tts/_async_client.py +++ b/sdk/TTS/speechmatics/tts/_async_client.py @@ -91,7 +91,7 @@ def __init__( ConfigurationError: If auth is None and API key is not provided/found. """ self._auth = auth or StaticKeyAuth(api_key) - self._url = url or os.environ.get("SPEECHMATICS_TTS_URL") or "https://tts.api.speechmatics.com/v2" + self._url = url or os.environ.get("SPEECHMATICS_TTS_URL") or "https://preview.tts.speechmatics.com" self._conn_config = conn_config or ConnectionConfig() self._request_id = str(uuid.uuid4()) self._transport = Transport(self._url, self._conn_config, self._auth, self._request_id) @@ -145,7 +145,7 @@ async def generate( "text": text, } - response = await self._transport.post(f"/generate/{voice}?output_format={output_format}", json_data=request_data) + response = await self._transport.post(f"/generate/{voice.value}?output_format={output_format.value}", json_data=request_data) return response async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: @@ -175,7 +175,7 @@ async def close(self) -> None: Examples: >>> client = AsyncClient(api_key="key") >>> try: - ... result = await client.transcribe("audio.wav") + ... result = await client.generate(text="Hello world") >>> finally: ... await client.close() """ diff --git a/tests/tts/async_http_test.py b/tests/tts/async_http_test.py new file mode 100644 index 0000000..2b53a06 --- /dev/null +++ b/tests/tts/async_http_test.py @@ -0,0 +1,9 @@ +import pytest +from speechmatics.tts import AsyncClient + +@pytest.mark.asyncio +async def test_async_http(): + async with AsyncClient() as client: + response = await client.generate(text="Hello world") + assert response.status == 200 + \ No newline at end of file