huge commit. assistant code is now semifunctional

This commit is contained in:
Looki2000 2023-12-01 21:48:56 +01:00
parent af6f2a13b7
commit f18e41476f
7 changed files with 501 additions and 168 deletions

149
frontend/core.py Normal file
View File

@@ -0,0 +1,149 @@
from vad_recorder import VADRecorder
from tts_stream import TTSStream
from faster_whisper import WhisperModel
import torch
import time
import numpy as np
class Core:
def __init__(self, whisper_model_name = "large-v3"):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
print("\n=======================================")
print(f"Using {self.device.capitalize()} for:")
print(" - Faster Whisper")
print(" - TTS")
print("=======================================\n")
print("Loading Whisper model... ", end="")
self.whisper_model = WhisperModel(whisper_model_name, device=self.device, compute_type="float16")
print("Done!")
# VADRecorder and TTSStream have their own console loading messages
self.vad_rec = VADRecorder()
self.tts = TTSStream(device=self.device)
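    # The order details below are only stored for later use (presumably in the
    # ChatGPT prompt); nothing in this commit consumes them yet.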
def set_order_settings(self, phone_number, order_items, delivery_address):
self.phone_number = phone_number
self.order_items = order_items
self.delivery_address = delivery_address
def set_speech_recog_settings(self, speech_recog_timeout, audio_input_device_name, audio_output_device_name, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
self.speech_recog_timeout = speech_recog_timeout
self.audio_input_device_name = audio_input_device_name
self.audio_output_device_name = audio_output_device_name
self.window_size_sec = window_size_sec
self.vad_threshold = vad_threshold
self.min_silence_duration_ms = min_silence_duration_ms
self.speech_pad_ms = speech_pad_ms
def set_tts_settings(self, speaker_wav):
self.speaker_wav = speaker_wav
def assistant_start(self):
print("Starting assistant...")
print("Setting TTS speaker... ", end="")
self.tts.change_speaker(self.speaker_wav)
print("Done!")
print("Starting VAD recording thread... ", end="")
self.vad_rec.start_vad_recorder(
target_device_name = self.audio_input_device_name,
window_size_sec = self.window_size_sec,
vad_threshold = self.vad_threshold,
min_silence_duration_ms = self.min_silence_duration_ms,
speech_pad_ms = self.speech_pad_ms
)
print("Done!")
print("LISTENING!!!")
last_recog_time = time.perf_counter()
speech_recog_text = ""
self.assistant_running = True
while self.assistant_running:
if self.vad_rec.speech:
last_recog_time = time.perf_counter()
if len(self.vad_rec.audios_for_whisper) > 0:
#stream_out.write(audios_for_whisper.pop(0))
audio = np.array(self.vad_rec.audios_for_whisper.pop(0), dtype=np.float32)
segments, _ = self.whisper_model.transcribe(audio, language="pl")
if not self.assistant_running:
break
text = "".join([segment.text for segment in segments])
#speech_recog_text += " " if len(speech_recog_text) else "" + text
if len(text) == 0:
continue
                if text[-1] not in ".,!?":
text += "."
speech_recog_text += text.strip() + "\n"
print("=========================================")
print(text)
last_recog_time = time.perf_counter()
elif time.perf_counter() - last_recog_time > self.speech_recog_timeout and len(speech_recog_text) > 0:
speech_recog_text = speech_recog_text.strip()
print("=========================================\n\n")
print("-----------------------------------------")
print("!!!!!!!!!! SENDING TO CHATGPT !!!!!!!!!!!")
print("-----------------------------------------")
print(speech_recog_text)
print("-----------------------------------------\n\n")
time.sleep(1) # fake chatgpt delay
gpt_response = "czat dżi pi ti plejsholder 1 2 3"
if not self.assistant_running:
break
speech_recog_text = ""
# tts
print("Speech synthesis stream started!")
self.tts.tts_speak(gpt_response)
#print(len(audios_for_whisper), time.perf_counter() - last_recog_time, len(speech_recog_text))
time.sleep(0.01)
        # the loop above exits once assistant_stop() sets assistant_running to False
def assistant_stop(self):
print("Stopping assistant... ", end="")
self.assistant_running = False
self.vad_rec.stop_vad_recorder()
print("Done!")

61
frontend/core_test.ipynb Normal file
View File

@@ -0,0 +1,61 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from core import Core\n",
"\n",
"core = Core(whisper_model_name = \"large-v3\")\n",
"\n",
"core.set_order_settings(\n",
" phone_number = 123456789,\n",
" order_items = \"1x margharitta\\n2x sos majonezowy\",\n",
" delivery_address = \"ul. Amogusowa 1337, Suski Małe\"\n",
")\n",
"\n",
"core.set_speech_recog_settings(\n",
" speech_recog_timeout = 2.5,\n",
" audio_input_device_name = \"Virtual\",\n",
" audio_output_device_name = \"placeholder\",\n",
" window_size_sec = 0.1,\n",
" vad_threshold = 0.6,\n",
" min_silence_duration_ms = 150,\n",
" speech_pad_ms = 0\n",
")\n",
"\n",
"core.set_tts_settings(\n",
" speaker_wav = \"voices/lector.wav\"\n",
")\n",
"\n",
"try:\n",
" core.assistant_start()\n",
"except (KeyboardInterrupt, SystemExit):\n",
" core.assistant_stop()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -6,9 +6,9 @@
"metadata": {},
"outputs": [],
"source": [
"from tts_stream import TTSstream\n",
"from tts_stream import TTSStream\n",
"\n",
"tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
"tts = TTSStream(speaker_wav=\"voices/lector.wav\")"
]
},
{
@@ -27,7 +27,7 @@
"metadata": {},
"outputs": [],
"source": [
"tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n"
"tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznych sieci neuronowych.\")\n"
]
}
],

View File

@@ -1,5 +1,4 @@
import os
import torch
import pyaudio
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
@@ -7,27 +6,35 @@ from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
import threading
import time
import re
# Check if CUDA is available
if torch.cuda.is_available():
print("Using CUDA")
device = "cuda"
else:
print("Using CPU")
device = "cpu"
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
class TTSstream:
def __init__(self, speaker_wav):
class TTSStream:
def __init__(self, speaker_wav=None, device=None):
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
if device is None:
import torch
# Check if CUDA is available
if torch.cuda.is_available():
print("Using CUDA")
device = "cuda"
else:
print("Using CPU")
device = "cpu"
#print(model_path)
print("Loading TTS model... ", end="")
#
# download model if it doesn't exist
if not os.path.exists(os.path.join(model_path, "config.json")):
print("Downloading model...")
print("Downloading model... ", end="")
tts = TTS()
tts.download_model_by_name(model_name=model_name)
@@ -43,11 +50,16 @@ class TTSstream:
)
self.model.to(device)
print("Done!")
self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
if speaker_wav is not None:
#self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
self.change_speaker(speaker_wav)
def change_speaker(self, speaker_wav):
print("Loading speaker... ", end="")
self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
print("Done!")
def _write_stream(self):
# play first play_buffer_size samples and remove them from the buffer
@@ -58,6 +70,7 @@ class TTSstream:
self.stream.write(self.chunk)
else:
if self.all_done:
#self.thread_ended = True
break
time.sleep(0.01)
@@ -65,42 +78,60 @@
def tts_speak(self, text):
self.play_buffer_size = 512
chunks = self.model.inference_stream(
text,
"pl",
self.gpt_cond_latent,
self.speaker_embedding,
stream_chunk_size=20,
)
# open pyaudio stream
p = pyaudio.PyAudio()
self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)
# for each sentence ending with . or ! or ?
for text in re.split(r"(?<=[.!?])", text):
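            # the zero-width lookbehind keeps the punctuation with each sentence,
            # e.g. re.split(r"(?<=[.!?])", "Ala ma kota. Serio!") returns
            # ['Ala ma kota.', ' Serio!', ''], hence the strip() and the
            # empty-string check below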
text = text.strip()
self.chunks_bin = b""
self.all_done = False
# run write_stream as thread
thread = threading.Thread(target=self._write_stream)
thread.start()
while True:
try:
# read chunks from chunks generator as they are generated
for self.chunk in chunks:
self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
break
# some weird error caused by coqui-tts
            except Exception:
                print("Error occurred when generating audio stream. Retrying...")
if len(text) == 0:
continue
self.all_done = True
chunks = self.model.inference_stream(
text,
"pl",
self.gpt_cond_latent,
self.speaker_embedding,
stream_chunk_size=20,
)
# wait for thread to finish
thread.join()
self.chunks_bin = b""
self.all_done = False
# run write_stream as thread
#self.thread_ended = False
thread = threading.Thread(target=self._write_stream)
thread.start()
while True:
try:
# read chunks from chunks generator as they are generated
for self.chunk in chunks:
self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
break
# some weird error caused by coqui-tts
                except Exception:
                    print("Error occurred when generating audio stream. Retrying...")
continue
self.all_done = True
# wait for thread to finish
thread.join()
# wait for thread ended
#while not self.thread_ended:
# time.sleep(0.01)
#while True:
# if self.thread_ended:
# break
# print("Waiting for thread to end...")
# time.sleep(0.01)
self.stream.close()
p.terminate()

149
frontend/vad_recorder.py Normal file
View File

@@ -0,0 +1,149 @@
import torch
import pyaudio
import numpy as np
import time
import onnxruntime as ort
import threading
ort.set_default_logger_severity(3)
SAMPLERATE = 16000
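# 16 kHz mono: the rate Silero VAD is run at here and the rate Whisper expects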
class VADRecorder:
#def __init__(self, target_device_name, window_size_sec = 0.2, use_onnx = True):
def __init__(self, use_onnx = True):
print("Loading Silero VAD model... ", end="")
self.vad_model, utils = torch.hub.load(
repo_or_dir="snakers4/silero-vad",
model="silero_vad",
force_reload=False,
onnx=use_onnx
)
(
_, # get_speech_timestamps
_, # save_audio
_, # read_audio
self.VADIterator,
_ # collect_chunks
) = utils
print("Done!")
self.vad_iterator = None
def _vad_recorder(self):
print("Listening...")
speech_win = 0
detected_audio = []
last_chunk = np.zeros(self.window_size, dtype=np.float32)
        # The VAD iterator has to be recreated periodically: after running for a while it starts hallucinating speech.
vad_iter_reload_delay = 60 * 2
vad_iter_load_time = time.time()
self.vad_iterator = self.VADIterator(
self.vad_model,
threshold = self.vad_threshold,
sampling_rate = SAMPLERATE,
min_silence_duration_ms = self.min_silence_duration_ms,
speech_pad_ms = self.speech_pad_ms
)
while self.rec_flag:
chunk = np.frombuffer(self.stream_in.read(self.window_size), dtype=np.float32)
speech_dict = self.vad_iterator(chunk)
            # check if speech_dict is {"start": x} or {"end": x}
if speech_dict is not None:
self.speech = "start" in speech_dict
if self.speech:
#print("Speech detected!")
if speech_win == 0:
detected_audio = last_chunk.tolist()
speech_win += 1
detected_audio += chunk.tolist()
else:
if time.time() - vad_iter_load_time > vad_iter_reload_delay:
self.vad_iterator.reset_states()
vad_iter_load_time = time.time()
self.vad_iterator = self.VADIterator(
self.vad_model,
threshold = self.vad_threshold,
sampling_rate = SAMPLERATE,
min_silence_duration_ms = self.min_silence_duration_ms,
speech_pad_ms = self.speech_pad_ms
)
print("Reloaded VADIterator!")
if speech_win > 0:
speech_win = 0
self.audios_for_whisper.append(detected_audio)
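                    # finished utterances queue up here; the consumer
                    # (Core.assistant_start) pops them off the front of the list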
last_chunk = chunk.copy()
def start_vad_recorder(self, target_device_name, window_size_sec = 0.1, vad_threshold = 0.6, min_silence_duration_ms = 150, speech_pad_ms = 0):
self.window_size = int(window_size_sec * SAMPLERATE)
self.vad_threshold = vad_threshold
self.min_silence_duration_ms = min_silence_duration_ms
self.speech_pad_ms = speech_pad_ms
self.p = pyaudio.PyAudio()
target_device_index = None
for i in range(self.p.get_device_count()):
device_info = self.p.get_device_info_by_index(i)
if device_info['maxInputChannels'] > 0 and target_device_name in device_info['name']:
target_device_index = i
break
if target_device_index is None:
print(f"No target device found with \"{target_device_name}\" in its name.")
exit()
try:
self.stream_in = self.p.open(format=pyaudio.paFloat32, channels=1, rate=SAMPLERATE, input=True, frames_per_buffer=self.window_size, input_device_index=target_device_index)
except OSError:
print(f"An unexpected error occured when trying to open device stream with \"{target_device_name}\" in its name. That could be caused by the device being disabled or unplugged.")
exit()
self.speech = False
self.audios_for_whisper = []
if self.vad_iterator is not None:
self.vad_iterator.reset_states()
self.rec_flag = True
self.vad_rec_thread = threading.Thread(target=self._vad_recorder, daemon=True)
self.vad_rec_thread.start()
def stop_vad_recorder(self):
self.rec_flag = False
self.vad_rec_thread.join()
self.stream_in.stop_stream()
self.stream_in.close()
self.p.terminate()

View File

@@ -0,0 +1,69 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from vad_recorder import VADRecorder\n",
"import time\n",
"\n",
"# Instantiate WhisperWrap with the target device name and other parameters\n",
"vad_rec = VADRecorder()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Starting VAD recording thread\")\n",
"\n",
"try:\n",
" vad_rec.start_vad_recorder(target_device_name=\"Virtual\")\n",
"\n",
"\n",
" recordings_count = 0\n",
" now_speech = False\n",
"\n",
" print(\"Done!\")\n",
" while True:\n",
" if vad_rec.speech != now_speech:\n",
" now_speech = vad_rec.speech\n",
" print(f\"Speech: {now_speech}\")\n",
" if len(vad_rec.audios_for_whisper) != recordings_count:\n",
" recordings_count = len(vad_rec.audios_for_whisper)\n",
" print(f\"Recordings count: {recordings_count}\")\n",
"\n",
" time.sleep(0.01)\n",
"\n",
"except (KeyboardInterrupt, SystemExit):\n",
" print(\"Cleaning up...\")\n",
" vad_rec.stop_vad_recorder()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,126 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import whisper\n",
"from tts_stream import TTSstream"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sounddevice as sd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import torch\n",
"\n",
"# force matplotlib gui backend\n",
"import matplotlib\n",
"matplotlib.use('TkAgg')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Loading whisper model...\")\n",
"\n",
"if torch.cuda.is_available():\n",
" print(\"using CUDA\")\n",
" device = \"cuda\"\n",
"else:\n",
" print(\"using CPU\")\n",
" device = \"cpu\"\n",
"\n",
"model = whisper.load_model(\"medium\").to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# optional for changing speaker to some another one\n",
"tts.change_speaker(\"voices/speaker_name.wav\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# record 10 seconds of audio with sounddevice\n",
"print(\"Recording audio...\")\n",
"\n",
"fs = 16000\n",
"duration = 4\n",
"frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
"sd.wait()\n",
"\n",
"\n",
"\n",
"frames = frames[:, 0]\n",
"#frames /= np.max(np.abs(frames))\n",
"\n",
"\n",
"## plot audio\n",
"#plt.plot(frames)\n",
"## set plot range to -1, 1\n",
"#plt.ylim(-1, 1)\n",
"#plt.show()\n",
"\n",
"# recognize text from audio\n",
"print(\"Recognizing text...\")\n",
"\n",
"result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
"whisper_text = result[\"text\"]\n",
"print(whisper_text)\n",
"\n",
"# synthesize text to audio\n",
"print(\"Synthesizing audio...\")\n",
"tts.tts_speak(whisper_text)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}