Commit 750dafd (1 parent: da94797)

WMS ID 9001: Update AI Speech livelabs with Live Transcribe multilingual language changes (#425)

* WMS ID 9001: Update AI Speech livelabs with Live Transcribe multilingual language changes
* Fix screenshots and formatting

File tree: 10 files changed, +236 −31 lines

oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/realtime_example.py renamed to oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/realtime_example_oracle.py

Lines changed: 1 addition & 0 deletions
@@ -124,6 +124,7 @@ def message_callback(message):
     realtime_speech_parameters.model_domain = (
         realtime_speech_parameters.MODEL_DOMAIN_GENERIC
     )
+    realtime_speech_parameters.model_type = "ORACLE"
     realtime_speech_parameters.partial_silence_threshold_in_ms = 0
     realtime_speech_parameters.final_silence_threshold_in_ms = 2000
     realtime_speech_parameters.encoding="audio/raw;rate=16000"
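
For orientation, here is a hedged sketch of the surrounding parameter block in `realtime_example_oracle.py` after this change, reassembled from the hunk above and the lab's ORACLE sample further down (`RealtimeParameters` constants are class-level in the OCI Python SDK, so class access is shown; treat this as illustrative, not the full file):

```python
# Sketch: ORACLE realtime parameters as configured in realtime_example_oracle.py.
from oci.ai_speech.models import RealtimeParameters

realtime_speech_parameters = RealtimeParameters()
realtime_speech_parameters.language_code = "en-US"
realtime_speech_parameters.model_domain = RealtimeParameters.MODEL_DOMAIN_GENERIC
realtime_speech_parameters.model_type = "ORACLE"  # the line this commit adds
realtime_speech_parameters.partial_silence_threshold_in_ms = 0
realtime_speech_parameters.final_silence_threshold_in_ms = 2000
realtime_speech_parameters.encoding = "audio/raw;rate=16000"
```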
oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/realtime_example_whisper.py (new file)

Lines changed: 159 additions & 0 deletions

@@ -0,0 +1,159 @@
import asyncio
import pyaudio
from collections import deque
import oci
from oci.config import from_file
from oci.auth.signers.security_token_signer import SecurityTokenSigner

from oci_ai_speech_realtime import (
    RealtimeSpeechClient,
    RealtimeSpeechClientListener
)

from oci.ai_speech.models import (
    RealtimeParameters,
)

import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Create a logger for this module
logger = logging.getLogger(__name__)

# This is needed for wss connections
# import truststore
# truststore.inject_into_ssl()

# Create a FIFO queue
queue = asyncio.Queue()

# Set audio parameters
SAMPLE_RATE = 16000
FORMAT = pyaudio.paInt16
CHANNELS = 1
BUFFER_DURATION_MS = 96

# Calculate the number of frames per buffer
FRAMES_PER_BUFFER = int(SAMPLE_RATE * BUFFER_DURATION_MS / 1000)


def authenticator():
    config = oci.config.from_file()
    with open(config["security_token_file"], "r") as f:
        token = f.readline()

    private_key = oci.signer.load_private_key_from_file(config["key_file"])

    return SecurityTokenSigner(token=token, private_key=private_key)


def audio_callback(in_data, frame_count, time_info, status):
    # This function will be called by PyAudio when there's new audio data
    queue.put_nowait(in_data)
    return (None, pyaudio.paContinue)


# Create a PyAudio object
p = pyaudio.PyAudio()

# Open the stream
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=SAMPLE_RATE,
    input=True,
    frames_per_buffer=FRAMES_PER_BUFFER,
    stream_callback=audio_callback,
)

stream.start_stream()
config = from_file()


async def send_audio(client):
    i = 0
    # loop = asyncio.get_running_loop()
    while True:
        data = await queue.get()

        # Send it over the websocket
        await client.send_data(data)
        i += 1

    # stream.close()
    # client.close()


class MyListener(RealtimeSpeechClientListener):
    def on_result(self, result):
        if result["transcriptions"][0]["isFinal"]:
            logger.info(
                f"Received final results: {result['transcriptions'][0]['transcription']}"
            )
        # partial results are currently supported by ORACLE model only
        # else:
        #     logger.info(
        #         f"Received partial results: {result['transcriptions'][0]['transcription']}"
        #     )

    def on_ack_message(self, ackmessage):
        return super().on_ack_message(ackmessage)

    def on_connect(self):
        return super().on_connect()

    def on_connect_message(self, connectmessage):
        return super().on_connect_message(connectmessage)

    def on_network_event(self, ackmessage):
        return super().on_network_event(ackmessage)

    def on_error(self):
        return super().on_error()

    def on_close(self, error_code, error_message):
        return super().on_close(error_code, error_message)


if __name__ == "__main__":
    # Run the event loop
    def message_callback(message):
        logger.info(f"Received message: {message}")

    realtime_speech_parameters: RealtimeParameters = RealtimeParameters()
    realtime_speech_parameters.language_code = "en"
    realtime_speech_parameters.model_domain = (
        realtime_speech_parameters.MODEL_DOMAIN_GENERIC
    )
    realtime_speech_parameters.model_type = "WHISPER"
    realtime_speech_parameters.encoding = "audio/raw;rate=16000"
    realtime_speech_parameters.punctuation = (
        realtime_speech_parameters.PUNCTUATION_AUTO
    )

    realtime_speech_url = "wss://realtime.aiservice.uk-london-1.oci.oraclecloud.com"
    client = RealtimeSpeechClient(
        realtime_speech_parameters=realtime_speech_parameters,
        config=config,
        listener=MyListener(),
        service_endpoint=realtime_speech_url,
        signer=authenticator(),
        compartment_id="<compartmentID>",
    )

    loop = asyncio.get_event_loop()
    loop.create_task(send_audio(client))
    loop.run_until_complete(client.connect())

    """
    Optionally request the final result on demand.
    The below code snippet will request a final result.

    await client.request_final_result()
    """

    if stream.is_active():
        stream.close()

    logger.info("Closed now")
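
The example above signs requests with a session token (`security_token_file` in `~/.oci/config`, typically created by `oci session authenticate`). As a hedged alternative sketch, assuming API-key auth is configured for your profile and that `RealtimeSpeechClient` accepts any standard OCI signer, the stock `oci.signer.Signer` could replace `authenticator()` when constructing the client; everything else stays the same:

```python
# Sketch (assumption): API-key signer instead of the session-token signer above.
# Requires a ~/.oci/config profile with tenancy, user, fingerprint, and key_file.
import oci

def api_key_signer():
    config = oci.config.from_file()
    return oci.signer.Signer(
        tenancy=config["tenancy"],
        user=config["user"],
        fingerprint=config["fingerprint"],
        private_key_file_location=config["key_file"],
    )
```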
oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 PyAudio>=0.2.14
-oci-ai-speech-realtime>=2.1.0
+oci-ai-speech-realtime>=2.2.0
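
Since the lab installs this with `pip install -r requirements.txt`, a quick standard-library-only sanity check (a sketch; it assumes nothing beyond the distribution name on PyPI) can confirm the upgrade took effect:

```python
# Check that the installed oci-ai-speech-realtime satisfies the new >=2.2.0 floor.
from importlib.metadata import version

installed = version("oci-ai-speech-realtime")
major, minor, *_ = (int(part) for part in installed.split("."))
assert (major, minor) >= (2, 2), f"found {installed}, expected >= 2.2.0"
print(f"oci-ai-speech-realtime {installed} OK")
```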
6 screenshot images updated (binary image files: 681 KB, 163 KB, 179 KB, 283 KB, 286 KB, 343 KB; previews not shown)

oci-artificial-intelligence/ai-speech/transcribe-live-audio/transcribe-live-audio.md

Lines changed: 75 additions & 30 deletions
@@ -3,26 +3,28 @@
 ## Introduction
 In this session, we will help users get familiar with OCI Speech live transcribe and teach them how to use our services via the cloud console.
 
-***Estimated Lab Time***: 5 minutes
+***Estimated Lab Time***: 30 minutes
 
 ### Objectives
 
 In this lab, you will:
 - Learn how to transcribe live audio to text from the OCI Console
 - Invoke custom vocabulary (customizations) in the OCI Console
+- Learn how to use the OCI AI Speech Realtime Python SDK to create live transcription sessions
 
 ### Prerequisites:
 - A Free tier or paid tenancy account in OCI (Oracle Cloud Infrastructure)
 - Tenancy is whitelisted to be able to use OCI Speech
 
 ## Task 1: Navigate to Overview Page
 
-Log into OCI Cloud Console. Using the Burger Menu on the top left corner, navigate to Analytics and AI menu and click it, and then select Language item under AI services.
+Log into OCI Console. Using the Burger Menu on the top left corner, navigate to the **Analytics and AI** menu, and then select **Speech** under **AI Services**.
 ![Navigate speech service menu](./images/navigate-to-ai-speech-menu.png " ")
 
-This will navigate you to the transcription jobs overview page.
-On the left you can toggle between overview and transcription jobs listing page.
-Under documentation you can find helpful links relevant to OCI speech service
+This will navigate you to the Speech overview page.
+From the left you can navigate to various OCI Speech offerings.
+
+From the Documentation section you can find helpful links relevant to the OCI Speech service
 ![Speech service overview page](./images/overview-page.png " ")
 
 
@@ -46,25 +48,34 @@ Under documentation you can find helpful links relevant to OCI speech service
 
 To change transcription parameters, look to the <strong>Configure transcription</strong> menu to the right
 
-1. Configure transcription
+### Configure transcription
+
+Here you can change parameters such as transcription model type, model domain, audio language, punctuation, partial and final silence thresholds, partial results stability and enable customizations
+![Configure transcription](./images/configure-transcription.png " ")
 
-Here you can change parameters such as transcription model type, audio language, punctuation, partial and final silence thresholds partial results stability and enable customizations
-![Configure transcription](./images/configure-transcription.png " ")
+- <strong>Model type:</strong> Use this parameter to select a model to use for generating transcriptions. Currently supported model types are: `ORACLE` and `WHISPER`
+
+> Note: Partial results are only supported by the `ORACLE` model
+
+- <strong>Model domain:</strong> Use this parameter to configure the transcription model for specialized audio, e.g. audio that features specific medical terminology. Currently supported model domains are: `GENERIC` and `MEDICAL`
 
-<strong>Choose domain:</strong> Use this parameter to configure the transcription model for specialized audio, e.g. audio that features specific medial terminology
+> Note: The `MEDICAL` domain is only supported by the `ORACLE` model
 
-<strong>Choose language:</strong> Use this parameter to configure the language of the speaker
+- <strong>Language:</strong> Use this parameter to configure the transcription language. The `WHISPER` model supports automatic language detection.
 
-<strong>Choose punctuation:</strong> Use this parameter to configure the punctuation mode for the transcription model
+- <strong>Punctuation:</strong> Use this parameter to configure the punctuation mode for the transcription model. Currently supported punctuation modes are: `NONE`, `AUTO` and `SPOKEN`
 
-<strong>Partial silence threshold:</strong> Use this parameter to configure how quickly partial results should be
-returned
+> Note: Punctuation mode `SPOKEN` is only supported by the `MEDICAL` domain
+
+The following parameters are only supported by the `ORACLE` model:
+
+- <strong>Partial silence threshold:</strong> Use this parameter to configure how quickly partial results should be returned. Value ranges from `0` to `2000` milliseconds.
 
-<strong>Final silence threshold:</strong> Use this parameter to configure how long to wait before a partial result is finalized
+- <strong>Final silence threshold:</strong> Use this parameter to configure how long to wait before a partial result is finalized. Value ranges from `0` to `5000` milliseconds.
 
-<strong>Partial results stability:</strong> Use this parameter to configure the stability of partial results (amount of confidence required before returning a partial result)
+- <strong>Partial results stability:</strong> Use this parameter to configure the stability of partial results (amount of confidence required before returning a partial result). Allowed values are `NONE`, `LOW`, `MEDIUM` and `HIGH`.
 
-<strong>Enable customizations:</strong> Check this box to choose a customization to use during your transcription session
+- <strong>Enable customizations:</strong> Check this box to choose a customization to use during your transcription session.
 
 ## Task 4: Enabling a customization
 
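
These console options correspond to fields on `RealtimeParameters` in the Python SDK; the sketch below maps them one-to-one, using the field names from this lab's samples further down (the `True` for partial-results stabilization follows the ORACLE sample values listed later; treat the whole block as illustrative):

```python
# Sketch: mapping the "Configure transcription" console options to SDK fields.
from oci.ai_speech.models import RealtimeParameters

params = RealtimeParameters()
params.model_type = "ORACLE"          # Model type: ORACLE or WHISPER
params.model_domain = RealtimeParameters.MODEL_DOMAIN_GENERIC  # MEDICAL is ORACLE-only
params.language_code = "en-US"        # Language ("en" for WHISPER, which can auto-detect)
params.punctuation = RealtimeParameters.PUNCTUATION_AUTO  # NONE / AUTO / SPOKEN
# ORACLE-only tuning parameters:
params.partial_silence_threshold_in_ms = 0    # 0-2000 ms
params.final_silence_threshold_in_ms = 2000   # 0-5000 ms
params.stabilize_partial_results = True       # partial-results stability
```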

@@ -99,9 +110,10 @@ Alternatively, both the required packages can be installed using the `requiremen
 pip install -r requirements.txt
 ```
 
+### Python example:
+
 OCI AI Speech live transcription uses websockets to relay audio data and receive text transcriptions in real time. This means your client must implement some key listener functions:
 
-<strong>Python example:</strong>
 ```
 on_result(result)
 // This function will be called whenever a result is returned from the
@@ -133,7 +145,7 @@ on_close(error_code, error_message) (optional)
 // Its implementation is not required
 ```
 
-Example implementation of listener functions:
+**Example implementation of listener functions:**
 ```
 class MyRealtimeListener(RealtimeClientListener):
     result = []
@@ -189,31 +201,34 @@ class MyRealtimeListener(RealtimeClientListener):
 
 <strong>Realtime client parameters</strong> can be set and included in your realtime client to change the behavior of your transcription session.
 
-*example values:*
+### Sample values for model type `ORACLE`:
 
-`language_code` : <strong>"en-US"</strong>
+- `language_code` : <strong>"en-US"</strong>
 
-`model_domain` : <strong>"GENERIC"</strong>
+- `model_type` : <strong>"ORACLE"</strong>
 
-`partial_silence_threshold_in_ms` : <strong>0</strong>
+- `model_domain` : <strong>"GENERIC"</strong>
 
-`final_silence_threshold_in_ms` : <strong>2000</strong>
+- `partial_silence_threshold_in_ms` : <strong>0</strong>
 
-`encoding` : <strong>"audio/raw;rate=16000"</strong>
+- `final_silence_threshold_in_ms` : <strong>2000</strong>
 
-`punctuation` : <strong>AUTO</strong>
+- `encoding` : <strong>"audio/raw;rate=16000"</strong>
 
-`should_ignore_invalid_customizations` : <strong>True</strong>
+- `punctuation` : <strong>AUTO</strong>
 
-`stabilize_partial_results` : <strong>True</strong>
+- `should_ignore_invalid_customizations` : <strong>True</strong>
 
-`customizations` : <strong>[Customization1]</strong>
+- `stabilize_partial_results` : <strong>True</strong>
 
-<strong>Example of setting realtime parameters</strong>
+- `customizations` : <strong>[Customization1]</strong>
+
+<strong>Sample implementation of setting realtime parameters for model type `ORACLE`:</strong>
 
 ```
 realtime_speech_parameters: RealtimeParameters = RealtimeParameters()
 realtime_speech_parameters.language_code = "en-US"
+realtime_speech_parameters.model_type = "ORACLE"
 realtime_speech_parameters.model_domain = (
     realtime_speech_parameters.MODEL_DOMAIN_GENERIC
 )
@@ -244,7 +259,37 @@ realtime_speech_parameters.customizations = [
 ]
 ```
 
-Download a fully implemented python example [here.](./files/realtime_example.py)
+Download a fully implemented python example [here.](./files/realtime_example_oracle.py)
+
+### Sample values for model type `WHISPER`:
+
+- `language_code` : <strong>"en"</strong>
+
+- `model_type` : <strong>"WHISPER"</strong>
+
+- `model_domain` : <strong>"GENERIC"</strong>
+
+- `encoding` : <strong>"audio/raw;rate=16000"</strong>
+
+- `punctuation` : <strong>AUTO</strong>
+
+<strong>Sample implementation of setting realtime parameters for model type `WHISPER`:</strong>
+
+```
+realtime_speech_parameters: RealtimeParameters = RealtimeParameters()
+realtime_speech_parameters.language_code = "en"
+realtime_speech_parameters.model_type = "WHISPER"
+realtime_speech_parameters.model_domain = (
+    realtime_speech_parameters.MODEL_DOMAIN_GENERIC
+)
+realtime_speech_parameters.encoding="audio/raw;rate=16000"
+realtime_speech_parameters.punctuation = (
+    realtime_speech_parameters.PUNCTUATION_AUTO
+)
+```
+
+Download a fully implemented python example [here.](./files/realtime_example_whisper.py)
 ## Acknowledgements
 * **Authors**
+    * Prabhutva Agrawal - Oracle AI Services
     * Alex Ginella - Oracle AI Services
