WIP: add ai speech sample #163

Merged: 1 commit, Oct 8, 2024

1 change: 1 addition & 0 deletions .env.template
@@ -31,6 +31,7 @@ AZURE_BLOB_CONTAINER_NAME="audio"
# Azure AI Speech
AZURE_AI_SPEECH_API_ENDPOINT="https://<speech-api-name>.cognitiveservices.azure.com/"
AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY="<speech-api-subscription-key>"
AZURE_AI_SPEECH_API_REGION="eastus"

# Bing search resource
BING_SUBSCRIPTION_KEY="<bing-subscription-key>"
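
Note: the new `AZURE_AI_SPEECH_API_REGION` entry pairs with the existing subscription key when constructing the Speech SDK configuration. A minimal sketch of how these variables might be consumed, assuming the `azure-cognitiveservices-speech` and `python-dotenv` packages are installed:

```python
from os import getenv

import azure.cognitiveservices.speech as speechsdk
from dotenv import load_dotenv

# Load the variables defined in .env (copied from .env.template)
load_dotenv()

# Build a SpeechConfig from the subscription key and region added above
speech_config = speechsdk.SpeechConfig(
    subscription=getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
    region=getenv("AZURE_AI_SPEECH_API_REGION"),
)
speech_config.speech_recognition_language = "en-US"
```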
2 changes: 2 additions & 0 deletions .gitignore
@@ -167,3 +167,5 @@ generated/
*.jpg
*.jpeg
.chroma
.stop
.transcribed.txt
26 changes: 26 additions & 0 deletions apps/14_streamlit_azure_ai_speech/README.md
@@ -0,0 +1,26 @@
# Real-time transcription with Azure AI Speech Service

This app demonstrates how to use the Azure AI Speech Service for real-time transcription.

## Prerequisites

- Python 3.10 or later
- Azure AI Speech Service
- Azure OpenAI Service

## Overview

```shell
# Speech to Text script
poetry run python apps/14_streamlit_azure_ai_speech/speech_to_text.py --help

# WIP: Streamlit app
poetry run python -m streamlit run apps/14_streamlit_azure_ai_speech/main.py
```

## References

- [How to recognize speech](https://learn.microsoft.com/azure/ai-services/speech-service/how-to-recognize-speech?pivots=programming-language-python)
- [Quickstart: Create real-time diarization](https://learn.microsoft.com/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python)
- [Speech to text containers with Docker](https://learn.microsoft.com/azure/ai-services/speech-service/speech-container-stt?tabs=container&pivots=programming-language-python)
- [Real-time meeting minutes with Azure Speech Service (Japanese)](https://zenn.dev/o_ken_surprise/articles/991f5b592b91ee)
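
The companion `speech_to_text.py` script referenced in the Overview is not shown in this excerpt. As a rough sketch of the continuous-recognition loop described in the "How to recognize speech" guide above (the output path and the 60-second lifetime are illustrative assumptions, not the script's actual interface):

```python
import time

import azure.cognitiveservices.speech as speechsdk

speech_config = speechsdk.SpeechConfig(subscription="<speech-api-subscription-key>", region="eastus")
speech_config.speech_recognition_language = "en-US"
audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)


def on_recognized(evt: speechsdk.SpeechRecognitionEventArgs) -> None:
    # Append each finalized utterance to the output file (illustrative path)
    if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
        with open(".transcribed.txt", "a") as f:
            f.write(evt.result.text + "\n")


recognizer.recognized.connect(on_recognized)
recognizer.start_continuous_recognition()
try:
    time.sleep(60)  # keep the process alive while audio streams in
finally:
    recognizer.stop_continuous_recognition()
```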
210 changes: 210 additions & 0 deletions apps/14_streamlit_azure_ai_speech/main.py
@@ -0,0 +1,210 @@
import pathlib
import subprocess
from os import getenv

import streamlit as st
from dotenv import load_dotenv
from openai import AzureOpenAI

load_dotenv()

# Initialize the session state
if "transcribed_result" not in st.session_state:
    st.session_state["transcribed_result"] = ""

with st.sidebar:
    inference_type = st.selectbox(
        label="INFERENCE_TYPE",
        options=[
            "azure",
            "local",
        ],
        key="INFERENCE_TYPE",
    )
    azure_ai_speech_api_language = st.selectbox(
        label="AZURE_AI_SPEECH_API_LANGUAGE",
        options=[
            "en-US",
            "ja-JP",
        ],
        key="AZURE_AI_SPEECH_API_LANGUAGE",
    )
    if inference_type == "local":
        path_to_model = st.text_input(
            label="PATH_TO_MODEL",
            value="./model",
            key="PATH_TO_MODEL",
            type="default",
        )
        stt_host = st.text_input(
            label="STT_HOST",
            value="ws://localhost:5000",
            key="STT_HOST",
            type="default",
        )
        st.warning("yet to be implemented")
    if inference_type == "azure":
        azure_openai_endpoint = st.text_input(
            label="AZURE_OPENAI_ENDPOINT",
            value=getenv("AZURE_OPENAI_ENDPOINT"),
            key="AZURE_OPENAI_ENDPOINT",
            type="default",
        )
        azure_openai_api_key = st.text_input(
            label="AZURE_OPENAI_API_KEY",
            value=getenv("AZURE_OPENAI_API_KEY"),
            key="AZURE_OPENAI_API_KEY",
            type="password",
        )
        azure_openai_api_version = st.text_input(
            label="AZURE_OPENAI_API_VERSION",
            value=getenv("AZURE_OPENAI_API_VERSION"),
            key="AZURE_OPENAI_API_VERSION",
            type="default",
        )
        azure_openai_gpt_model = st.text_input(
            label="AZURE_OPENAI_GPT_MODEL",
            value=getenv("AZURE_OPENAI_GPT_MODEL"),
            key="AZURE_OPENAI_GPT_MODEL",
            type="default",
        )
        azure_ai_speech_api_subscription_key = st.text_input(
            label="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            value=getenv("AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY"),
            key="AZURE_AI_SPEECH_API_SUBSCRIPTION_KEY",
            type="password",
        )
        azure_ai_speech_api_region = st.text_input(
            label="AZURE_AI_SPEECH_API_REGION",
            value=getenv("AZURE_AI_SPEECH_API_REGION"),
            key="AZURE_AI_SPEECH_API_REGION",
            type="default",
        )
    "[Azure Portal](https://portal.azure.com/)"
    "[Azure OpenAI Studio](https://oai.azure.com/resource/overview)"
    "[View the source code](https://github.com/ks6088ts-labs/workshop-azure-openai/blob/main/apps/14_streamlit_azure_ai_speech/main.py)"


def is_configured():
    if inference_type == "local":
        return path_to_model and stt_host
    if inference_type == "azure":
        # The Azure path also needs the Speech key and region to start transcription
        return (
            azure_openai_api_key
            and azure_openai_endpoint
            and azure_openai_api_version
            and azure_openai_gpt_model
            and azure_ai_speech_api_subscription_key
            and azure_ai_speech_api_region
        )


st.title("Transcribe text")

if not is_configured():
    st.warning("Please fill in the required fields in the sidebar.")

st.info("This is a sample app that transcribes speech to text.")

# ---
# 2-column layout

# 1st row
row1_left, row1_right = st.columns(2)
with row1_left:
    input = st.text_area(
        "Transcribed text",
        height=400,
        placeholder="Please enter the text to transcribe.",
        key="input",
        value=st.session_state["transcribed_result"],
    )

with row1_right:
    start_transcribe_button = st.button("start", disabled=not is_configured())
    stop_transcribe_button = st.button("stop", disabled=not is_configured())
    transcription_status = st.empty()

# Horizontal rule between the rows
st.markdown("---")

# 2nd row
row2_left, row2_right = st.columns(2)

with row2_left:
    selected_task = st.selectbox(
        "Task",
        [
            "Create summaries from the following text",
            "Extract 3 main points from the following text",
            # Add more tasks here
        ],
        key="selected_task",
        index=0,
    )

with row2_right:
    run_task_button = st.button("run_task", disabled=not is_configured())

path_to_transcribed_text = ".transcribed.txt"


def start_recognition():
    # Keep a handle to the transcriber subprocess in session state so that
    # repeated clicks on "start" do not spawn multiple transcribers
    if inference_type == "local":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --endpoint {stt_host} --language {azure_ai_speech_api_language} --type local --verbose"  # noqa
        st.session_state["process"] = subprocess.Popen(command, shell=True)
        st.warning("Local inference is not yet implemented.")
        return
    if inference_type == "azure":
        command = f"python apps/14_streamlit_azure_ai_speech/speech_to_text.py --output {path_to_transcribed_text} --subscription {azure_ai_speech_api_subscription_key} --region {azure_ai_speech_api_region} --language {azure_ai_speech_api_language} --type azure --verbose"  # noqa
        st.session_state["process"] = subprocess.Popen(command, shell=True)


def run_task(selected_task: str, input: str) -> str | None:
    if inference_type == "local":
        st.warning("Local inference is not yet implemented.")
        return None
    if inference_type == "azure":
        client = AzureOpenAI(
            api_key=azure_openai_api_key,
            api_version=azure_openai_api_version,
            azure_endpoint=azure_openai_endpoint,
        )

        response = client.chat.completions.create(
            model=azure_openai_gpt_model,
            messages=[
                {
                    "role": "system",
                    "content": f"""
                    Task: {selected_task}.
                    ---
                    {input}
                    ---
                    """,
                },
            ],
        )
        return response.choices[0].message.content
    raise ValueError(f"Inference type is not supported: {inference_type}")


def load_transcribed_text():
    with open(path_to_transcribed_text) as f:
        return f.read()


if start_transcribe_button:
    if not st.session_state.get("process"):
        transcription_status.info(f"Transcribing... (language={azure_ai_speech_api_language})")
        start_recognition()
    else:
        transcription_status.warning("Transcription is already running.")

if stop_transcribe_button:
    # Signal the transcriber subprocess to stop via a sentinel file,
    # then pull its accumulated output into the session state
    pathlib.Path(".stop").touch()
    output = load_transcribed_text()
    st.session_state.transcribed_result = output
    st.session_state["process"] = None
    st.rerun()

if run_task_button:
    with st.spinner("Running..."):
        output = run_task(
            selected_task=selected_task,
            input=input,
        )
        st.write(output)
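
Note: `main.py` coordinates with the transcriber subprocess through the two files added to `.gitignore` above: touching `.stop` asks the subprocess to finish, and `.transcribed.txt` carries the accumulated transcript back. A sketch of the consuming side of that contract, under the assumption that `speech_to_text.py` polls for the sentinel:

```python
import pathlib
import time

stop_sentinel = pathlib.Path(".stop")

# Illustrative polling loop: keep transcribing until the Streamlit app touches .stop
while not stop_sentinel.exists():
    time.sleep(0.5)

# Remove the sentinel so the next session starts fresh (assumption)
stop_sentinel.unlink(missing_ok=True)
```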