-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep_04_whisper_with_chat_gpt.py
121 lines (94 loc) · 3.35 KB
/
step_04_whisper_with_chat_gpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import asyncio
import pyaudio
import wave
import whisper
from pydub import AudioSegment
from dotenv import load_dotenv
import openai
import os
load_dotenv()  # read OPENAI_API_KEY / WAKETERM from a local .env file
openai.api_key = os.getenv("OPENAI_API_KEY")
# Wake word that switches the listener into command mode. Compared against a
# lowercased transcript later, so the env value is presumably lowercase —
# TODO confirm; also note it is None if WAKETERM is unset.
WAKE_TERM = os.getenv("WAKETERM")
# Parameters for recording audio
FORMAT = pyaudio.paInt16  # 16-bit signed integer samples
CHANNELS = 1  # mono
RATE = 16000  # 16 kHz sample rate (the rate Whisper models expect)
CHUNK = 2048  # frames per stream.read() call
# Initialize the audio interface
audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=CHANNELS,
rate=RATE, input=True,
frames_per_buffer=CHUNK)
# Load the Whisper model
model = whisper.load_model("medium")
def get_completion(prompt, model="gpt-4"):
    """Send *prompt* as a single user message to the OpenAI chat API.

    Returns the assistant's reply text. Temperature 0 keeps the
    completion as deterministic as the API allows.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai.ChatCompletion.create(
        model=model,
        messages=chat_messages,
        # this is the degree of randomness of the model's output
        temperature=0,
    )
    return completion.choices[0].message["content"]
async def record_audio(filename, duration=4):
    """Asynchronous generator that records audio chunks forever.

    Each iteration captures roughly *duration* seconds from the global
    input stream, writes them to *filename* as a WAV file, and yields
    that filename. The same file is overwritten on every pass.
    """
    reads_per_chunk = int(RATE / CHUNK * duration)  # loop-invariant
    while True:
        frames = [stream.read(CHUNK) for _ in range(reads_per_chunk)]
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(audio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
        yield filename
        await asyncio.sleep(0)  # cooperative yield to the event loop
async def transcribe_audio():
    """
    Continuously transcribe microphone audio.

    Audio is recorded in ~4-second chunks and each chunk is transcribed
    with Whisper. When the wake term appears in a transcript, control
    passes to wake_word_detected() to capture a follow-up command.
    """
    # Lowercase the wake term once so the comparison agrees with the
    # lowercased transcript, and guard against WAKETERM being unset
    # (os.getenv returns None, and `None in str` raises TypeError).
    wake_term = WAKE_TERM.lower() if WAKE_TERM else None
    audio_generator = record_audio("chunk.wav")
    async for filename in audio_generator:
        # Named `waveform` (not `audio`) to avoid shadowing the
        # module-level PyAudio instance.
        waveform = whisper.load_audio(filename)
        waveform = whisper.pad_or_trim(waveform)
        mel = whisper.log_mel_spectrogram(waveform).to(model.device)
        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)
        if wake_term and wake_term in result.text.lower():
            await wake_word_detected()
        print("Transcription:", result.text)
async def wake_word_detected():
    """
    Handle a wake-word hit: record one command chunk and answer it.

    Records a single ~10-second chunk. If the whole chunk's average
    loudness is below -40 dBFS it is treated as silence and discarded;
    otherwise it is transcribed, the text is sent to the chat model via
    get_completion(), and the response is printed. Exactly one chunk is
    processed per call before returning to passive listening.
    """
    print("Wake word detected. Listening for command...")
    audio_generator = record_audio("command.wav", duration=10)
    async for filename in audio_generator:
        audio_segment = AudioSegment.from_wav(filename)
        silence_threshold = -40  # dBFS; mean loudness below this counts as silence
        if audio_segment.dBFS < silence_threshold:
            print("Silence detected. Stopping recording...")
            break
        # Named `waveform` (not `audio`) to avoid shadowing the
        # module-level PyAudio instance.
        waveform = whisper.load_audio(filename)
        waveform = whisper.pad_or_trim(waveform)
        mel = whisper.log_mel_spectrogram(waveform).to(model.device)
        options = whisper.DecodingOptions()
        result = whisper.decode(model, mel, options)
        print("Command transcription:", result.text)
        response = get_completion(result.text)
        print("Response:", response)
        break  # one command per wake word
def main():
    """Entry point: run the continuous transcription loop until interrupted."""
    asyncio.run(transcribe_audio())


if __name__ == "__main__":
    main()