Commit 750dafd (1 parent: da94797)

WMS ID 9001: Update AI Speech livelabs with Live Transcribe multilingual language changes (#425)

* WMS ID 9001: Update AI Speech livelabs with Live Transcribe multilingual language changes
* Fix screenshots and formatting

File tree: 10 files changed, +236 −31 lines

oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/realtime_example.py renamed to oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/realtime_example_oracle.py

Lines changed: 1 addition & 0 deletions
@@ -124,6 +124,7 @@ def message_callback(message):
     realtime_speech_parameters.model_domain = (
         realtime_speech_parameters.MODEL_DOMAIN_GENERIC
     )
+    realtime_speech_parameters.model_type = "ORACLE"
     realtime_speech_parameters.partial_silence_threshold_in_ms = 0
     realtime_speech_parameters.final_silence_threshold_in_ms = 2000
     realtime_speech_parameters.encoding="audio/raw;rate=16000"
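
For orientation, here is a hedged sketch of the surrounding parameter block in `realtime_example_oracle.py` after this change, reassembled from the hunk above and the lab's ORACLE sample further down (`RealtimeParameters` constants are class-level in the OCI Python SDK, so class access is shown; treat this as illustrative, not the full file):

```python
# Sketch: ORACLE realtime parameters as configured in realtime_example_oracle.py.
from oci.ai_speech.models import RealtimeParameters

realtime_speech_parameters = RealtimeParameters()
realtime_speech_parameters.language_code = "en-US"
realtime_speech_parameters.model_domain = RealtimeParameters.MODEL_DOMAIN_GENERIC
realtime_speech_parameters.model_type = "ORACLE"  # the line this commit adds
realtime_speech_parameters.partial_silence_threshold_in_ms = 0
realtime_speech_parameters.final_silence_threshold_in_ms = 2000
realtime_speech_parameters.encoding = "audio/raw;rate=16000"
```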
oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/realtime_example_whisper.py (new file)

Lines changed: 159 additions & 0 deletions

@@ -0,0 +1,159 @@
import asyncio
import pyaudio
from collections import deque
import oci
from oci.config import from_file
from oci.auth.signers.security_token_signer import SecurityTokenSigner

from oci_ai_speech_realtime import (
    RealtimeSpeechClient,
    RealtimeSpeechClientListener
)

from oci.ai_speech.models import (
    RealtimeParameters,
)

import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Create a logger for this module
logger = logging.getLogger(__name__)

# This is needed for wss connections
# import truststore
# truststore.inject_into_ssl()

# Create a FIFO queue
queue = asyncio.Queue()

# Set audio parameters
SAMPLE_RATE = 16000
FORMAT = pyaudio.paInt16
CHANNELS = 1
BUFFER_DURATION_MS = 96

# Calculate the number of frames per buffer
FRAMES_PER_BUFFER = int(SAMPLE_RATE * BUFFER_DURATION_MS / 1000)


def authenticator():
    config = oci.config.from_file()
    with open(config["security_token_file"], "r") as f:
        token = f.readline()

    private_key = oci.signer.load_private_key_from_file(config["key_file"])

    return SecurityTokenSigner(token=token, private_key=private_key)


def audio_callback(in_data, frame_count, time_info, status):
    # This function will be called by PyAudio when there's new audio data
    queue.put_nowait(in_data)
    return (None, pyaudio.paContinue)


# Create a PyAudio object
p = pyaudio.PyAudio()

# Open the stream
stream = p.open(
    format=FORMAT,
    channels=CHANNELS,
    rate=SAMPLE_RATE,
    input=True,
    frames_per_buffer=FRAMES_PER_BUFFER,
    stream_callback=audio_callback,
)

stream.start_stream()
config = from_file()


async def send_audio(client):
    i = 0
    # loop = asyncio.get_running_loop()
    while True:
        data = await queue.get()

        # Send it over the websocket
        await client.send_data(data)
        i += 1

    # stream.close()
    # client.close()


class MyListener(RealtimeSpeechClientListener):
    def on_result(self, result):
        if result["transcriptions"][0]["isFinal"]:
            logger.info(
                f"Received final results: {result['transcriptions'][0]['transcription']}"
            )
        # partial results are currently supported by ORACLE model only
        # else:
        #     logger.info(
        #         f"Received partial results: {result['transcriptions'][0]['transcription']}"
        #     )

    def on_ack_message(self, ackmessage):
        return super().on_ack_message(ackmessage)

    def on_connect(self):
        return super().on_connect()

    def on_connect_message(self, connectmessage):
        return super().on_connect_message(connectmessage)

    def on_network_event(self, ackmessage):
        return super().on_network_event(ackmessage)

    def on_error(self):
        return super().on_error()

    def on_close(self, error_code, error_message):
        return super().on_close(error_code, error_message)


if __name__ == "__main__":
    # Run the event loop
    def message_callback(message):
        logger.info(f"Received message: {message}")

    realtime_speech_parameters: RealtimeParameters = RealtimeParameters()
    realtime_speech_parameters.language_code = "en"
    realtime_speech_parameters.model_domain = (
        realtime_speech_parameters.MODEL_DOMAIN_GENERIC
    )
    realtime_speech_parameters.model_type = "WHISPER"
    realtime_speech_parameters.encoding = "audio/raw;rate=16000"
    realtime_speech_parameters.punctuation = (
        realtime_speech_parameters.PUNCTUATION_AUTO
    )

    realtime_speech_url = "wss://realtime.aiservice.uk-london-1.oci.oraclecloud.com"
    client = RealtimeSpeechClient(
        realtime_speech_parameters=realtime_speech_parameters,
        config=config,
        listener=MyListener(),
        service_endpoint=realtime_speech_url,
        signer=authenticator(),
        compartment_id="<compartmentID>",
    )

    loop = asyncio.get_event_loop()
    loop.create_task(send_audio(client))
    loop.run_until_complete(client.connect())

    """
    Optionally request the final result on demand.
    The below code snippet will request a final result.

    await client.request_final_result()
    """

    if stream.is_active():
        stream.close()

    logger.info("Closed now")
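
The example above signs requests with a session token (`security_token_file` in `~/.oci/config`, typically created by `oci session authenticate`). As a hedged alternative sketch, assuming API-key auth is configured for your profile and that `RealtimeSpeechClient` accepts any standard OCI signer, the stock `oci.signer.Signer` could replace `authenticator()` when constructing the client; everything else stays the same:

```python
# Sketch (assumption): API-key signer instead of the session-token signer above.
# Requires a ~/.oci/config profile with tenancy, user, fingerprint, and key_file.
import oci

def api_key_signer():
    config = oci.config.from_file()
    return oci.signer.Signer(
        tenancy=config["tenancy"],
        user=config["user"],
        fingerprint=config["fingerprint"],
        private_key_file_location=config["key_file"],
    )
```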
oci-artificial-intelligence/ai-speech/transcribe-live-audio/files/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -1,2 +1,2 @@
 PyAudio>=0.2.14
-oci-ai-speech-realtime>=2.1.0
+oci-ai-speech-realtime>=2.2.0
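
Since the lab installs this with `pip install -r requirements.txt`, a quick standard-library-only sanity check (a sketch; it assumes nothing beyond the distribution name on PyPI) can confirm the upgrade took effect:

```python
# Check that the installed oci-ai-speech-realtime satisfies the new >=2.2.0 floor.
from importlib.metadata import version

installed = version("oci-ai-speech-realtime")
major, minor, *_ = (int(part) for part in installed.split("."))
assert (major, minor) >= (2, 2), f"found {installed}, expected >= 2.2.0"
print(f"oci-ai-speech-realtime {installed} OK")
```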
6 screenshot images updated (binary image files: 681 KB, 163 KB, 179 KB, 283 KB, 286 KB, 343 KB; previews not shown)

oci-artificial-intelligence/ai-speech/transcribe-live-audio/transcribe-live-audio.md

Lines changed: 75 additions & 30 deletions
@@ -3,26 +3,28 @@
 ## Introduction
 In this session, we will help users get familiar with OCI Speech live transcribe and teach them how to use our services via the cloud console.
 
-***Estimated Lab Time***: 5 minutes
+***Estimated Lab Time***: 30 minutes
 
 ### Objectives
 
 In this lab, you will:
 - Learn how to transcribe live audio to text from the OCI Console
 - Invoke custom vocabulary (customizations) in the OCI Console
+- Learn how to use the OCI AI Speech Realtime Python SDK to create live transcription sessions
 
 ### Prerequisites:
 - A Free tier or paid tenancy account in OCI (Oracle Cloud Infrastructure)
 - Tenancy is whitelisted to be able to use OCI Speech
 
 ## Task 1: Navigate to Overview Page
 
-Log into OCI Cloud Console. Using the Burger Menu on the top left corner, navigate to Analytics and AI menu and click it, and then select Language item under AI services.
+Log into OCI Console. Using the Burger Menu on the top left corner, navigate to the **Analytics and AI** menu, and then select **Speech** under **AI Services**.
 ![Navigate speech service menu](./images/navigate-to-ai-speech-menu.png " ")
 
-This will navigate you to the transcription jobs overview page.
-On the left you can toggle between overview and transcription jobs listing page.
-Under documentation you can find helpful links relevant to OCI speech service
+This will navigate you to the Speech overview page.
+From the left you can navigate to various OCI Speech offerings.
+
+From the Documentation section you can find helpful links relevant to the OCI Speech service
 ![Speech service overview page](./images/overview-page.png " ")
 
 
@@ -46,25 +48,34 @@ Under documentation you can find helpful links relevant to OCI speech service
 
 To change transcription parameters, look to the <strong>Configure transcription</strong> menu to the right
 
-1. Configure transcription
+### Configure transcription
+
+Here you can change parameters such as transcription model type, model domain, audio language, punctuation, partial and final silence thresholds, partial results stability and enable customizations
+![Configure transcription](./images/configure-transcription.png " ")
 
-Here you can change parameters such as transcription model type, audio language, punctuation, partial and final silence thresholds partial results stability and enable customizations
-![Configure transcription](./images/configure-transcription.png " ")
+- <strong>Model type:</strong> Use this parameter to select a model to use for generating transcriptions. Currently supported model types are: `ORACLE` and `WHISPER`
+
+> Note: Partial results are only supported by the `ORACLE` model
+
+- <strong>Model domain:</strong> Use this parameter to configure the transcription model for specialized audio, e.g. audio that features specific medical terminology. Currently supported model domains are: `GENERIC` and `MEDICAL`
 
-<strong>Choose domain:</strong> Use this parameter to configure the transcription model for specialized audio, e.g. audio that features specific medial terminology
+> Note: The `MEDICAL` domain is only supported by the `ORACLE` model
 
-<strong>Choose language:</strong> Use this parameter to configure the language of the speaker
+- <strong>Language:</strong> Use this parameter to configure the transcription language. The `WHISPER` model supports automatic language detection.
 
-<strong>Choose punctuation:</strong> Use this parameter to configure the punctuation mode for the transcription model
+- <strong>Punctuation:</strong> Use this parameter to configure the punctuation mode for the transcription model. Currently supported punctuation modes are: `NONE`, `AUTO` and `SPOKEN`
 
-<strong>Partial silence threshold:</strong> Use this parameter to configure how quickly partial results should be
-returned
+> Note: Punctuation mode `SPOKEN` is only supported by the `MEDICAL` domain
+
+The following parameters are only supported by the `ORACLE` model:
+
+- <strong>Partial silence threshold:</strong> Use this parameter to configure how quickly partial results should be returned. Value ranges from `0` to `2000` milliseconds.
 
-<strong>Final silence threshold:</strong> Use this parameter to configure how long to wait before a partial result is finalized
+- <strong>Final silence threshold:</strong> Use this parameter to configure how long to wait before a partial result is finalized. Value ranges from `0` to `5000` milliseconds.
 
-<strong>Partial results stability:</strong> Use this parameter to configure the stability of partial results (amount of confidence required before returning a partial result)
+- <strong>Partial results stability:</strong> Use this parameter to configure the stability of partial results (amount of confidence required before returning a partial result). Allowed values are `NONE`, `LOW`, `MEDIUM` and `HIGH`.
 
-<strong>Enable customizations:</strong> Check this box to choose a customization to use during your transcription session
+- <strong>Enable customizations:</strong> Check this box to choose a customization to use during your transcription session.
 
 ## Task 4: Enabling a customization
 
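
These console options correspond to fields on `RealtimeParameters` in the Python SDK; the sketch below maps them one-to-one, using the field names from this lab's samples further down (the `True` for partial-results stabilization follows the ORACLE sample values listed later; treat the whole block as illustrative):

```python
# Sketch: mapping the "Configure transcription" console options to SDK fields.
from oci.ai_speech.models import RealtimeParameters

params = RealtimeParameters()
params.model_type = "ORACLE"          # Model type: ORACLE or WHISPER
params.model_domain = RealtimeParameters.MODEL_DOMAIN_GENERIC  # MEDICAL is ORACLE-only
params.language_code = "en-US"        # Language ("en" for WHISPER, which can auto-detect)
params.punctuation = RealtimeParameters.PUNCTUATION_AUTO  # NONE / AUTO / SPOKEN
# ORACLE-only tuning parameters:
params.partial_silence_threshold_in_ms = 0    # 0-2000 ms
params.final_silence_threshold_in_ms = 2000   # 0-5000 ms
params.stabilize_partial_results = True       # partial-results stability
```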

@@ -99,9 +110,10 @@ Alternatively, both the required packages can be installed using the `requiremen
 pip install -r requirements.txt
 ```
 
+### Python example:
+
 OCI AI Speech live transcription uses websockets to relay audio data and receive text transcriptions in real time. This means your client must implement some key listener functions:
 
-<strong>Python example:</strong>
 ```
 on_result(result)
 // This function will be called whenever a result is returned from the
@@ -133,7 +145,7 @@ on_close(error_code, error_message) (optional)
 // Its implementation is not required
 ```
 
-Example implementation of listener functions:
+**Example implementation of listener functions:**
 ```
 class MyRealtimeListener(RealtimeClientListener):
     result = []
@@ -189,31 +201,34 @@ class MyRealtimeListener(RealtimeClientListener):
 
 <strong>Realtime client parameters</strong> can be set and included in your realtime client to change the behavior of your transcription session.
 
-*example values:*
+### Sample values for model type `ORACLE`:
 
-`language_code` : <strong>"en-US"</strong>
+- `language_code` : <strong>"en-US"</strong>
 
-`model_domain` : <strong>"GENERIC"</strong>
+- `model_type` : <strong>"ORACLE"</strong>
 
-`partial_silence_threshold_in_ms` : <strong>0</strong>
+- `model_domain` : <strong>"GENERIC"</strong>
 
-`final_silence_threshold_in_ms` : <strong>2000</strong>
+- `partial_silence_threshold_in_ms` : <strong>0</strong>
 
-`encoding` : <strong>"audio/raw;rate=16000"</strong>
+- `final_silence_threshold_in_ms` : <strong>2000</strong>
 
-`punctuation` : <strong>AUTO</strong>
+- `encoding` : <strong>"audio/raw;rate=16000"</strong>
 
-`should_ignore_invalid_customizations` : <strong>True</strong>
+- `punctuation` : <strong>AUTO</strong>
 
-`stabilize_partial_results` : <strong>True</strong>
+- `should_ignore_invalid_customizations` : <strong>True</strong>
 
-`customizations` : <strong>[Customization1]</strong>
+- `stabilize_partial_results` : <strong>True</strong>
 
-<strong>Example of setting realtime parameters</strong>
+- `customizations` : <strong>[Customization1]</strong>
+
+<strong>Sample implementation of setting realtime parameters for model type `ORACLE`:</strong>
 
 ```
 realtime_speech_parameters: RealtimeParameters = RealtimeParameters()
 realtime_speech_parameters.language_code = "en-US"
+realtime_speech_parameters.model_type = "ORACLE"
 realtime_speech_parameters.model_domain = (
     realtime_speech_parameters.MODEL_DOMAIN_GENERIC
 )
@@ -244,7 +259,37 @@ realtime_speech_parameters.customizations = [
 ]
 ```
 
-Download a fully implemented python example [here.](./files/realtime_example.py)
+Download a fully implemented python example [here.](./files/realtime_example_oracle.py)
+
+### Sample values for model type `WHISPER`:
+
+- `language_code` : <strong>"en"</strong>
+
+- `model_type` : <strong>"WHISPER"</strong>
+
+- `model_domain` : <strong>"GENERIC"</strong>
+
+- `encoding` : <strong>"audio/raw;rate=16000"</strong>
+
+- `punctuation` : <strong>AUTO</strong>
+
+<strong>Sample implementation of setting realtime parameters for model type `WHISPER`:</strong>
+
+```
+realtime_speech_parameters: RealtimeParameters = RealtimeParameters()
+realtime_speech_parameters.language_code = "en"
+realtime_speech_parameters.model_type = "WHISPER"
+realtime_speech_parameters.model_domain = (
+    realtime_speech_parameters.MODEL_DOMAIN_GENERIC
+)
+realtime_speech_parameters.encoding="audio/raw;rate=16000"
+realtime_speech_parameters.punctuation = (
+    realtime_speech_parameters.PUNCTUATION_AUTO
+)
+```
+
+Download a fully implemented python example [here.](./files/realtime_example_whisper.py)
 ## Acknowledgements
 * **Authors**
+    * Prabhutva Agrawal - Oracle AI Services
     * Alex Ginella - Oracle AI Services
