From f18e41476fd676aa6e0fa7d4c086bfce90f99985 Mon Sep 17 00:00:00 2001
From: Looki2000
Date: Fri, 1 Dec 2023 21:48:56 +0100
Subject: [PATCH] Huge commit: assistant code is now semi-functional

---
 frontend/core.py                 | 149 +++++++++++++++++++++++++++++++
 frontend/core_test.ipynb         |  61 +++++++++++++
 frontend/tts test.ipynb          |   6 +-
 frontend/tts_stream.py           | 109 ++++++++++++++--------
 frontend/vad_recorder.py         | 149 +++++++++++++++++++++++++++++++
 frontend/vad_recorder_test.ipynb |  69 ++++++++++++++
 frontend/whisper_tts_test.ipynb  | 126 --------------------------
 7 files changed, 501 insertions(+), 168 deletions(-)
 create mode 100644 frontend/core.py
 create mode 100644 frontend/core_test.ipynb
 create mode 100644 frontend/vad_recorder.py
 create mode 100644 frontend/vad_recorder_test.ipynb
 delete mode 100644 frontend/whisper_tts_test.ipynb

diff --git a/frontend/core.py b/frontend/core.py
new file mode 100644
index 0000000..551a939
--- /dev/null
+++ b/frontend/core.py
@@ -0,0 +1,149 @@
+from vad_recorder import VADRecorder
+from tts_stream import TTSStream
+from faster_whisper import WhisperModel
+import torch
+import time
+import numpy as np
+
+
+class Core:
+    def __init__(self, whisper_model_name = "large-v3"):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print("\n=======================================")
+        print(f"Using {self.device.capitalize()} for:")
+        print(" - Faster Whisper")
+        print(" - TTS")
+        print("=======================================\n")
+
+        print("Loading Whisper model... ", end="")
+        # float16 kernels are GPU-only, so use float32 when running on CPU
+        compute_type = "float16" if self.device == "cuda" else "float32"
+        self.whisper_model = WhisperModel(whisper_model_name, device=self.device, compute_type=compute_type)
+        print("Done!")
+
+        # VADRecorder and TTSStream have their own console loading messages
+        self.vad_rec = VADRecorder()
+        self.tts = TTSStream(device=self.device)
+
+
+    def set_order_settings(self, phone_number, order_items, delivery_address):
+        self.phone_number = phone_number
+        self.order_items = order_items
+        self.delivery_address = delivery_address
+
+
+    def set_speech_recog_settings(self, speech_recog_timeout, audio_input_device_name, audio_output_device_name, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
+        self.speech_recog_timeout = speech_recog_timeout
+        self.audio_input_device_name = audio_input_device_name
+        self.audio_output_device_name = audio_output_device_name
+        self.window_size_sec = window_size_sec
+        self.vad_threshold = vad_threshold
+        self.min_silence_duration_ms = min_silence_duration_ms
+        self.speech_pad_ms = speech_pad_ms
+
+
+    def set_tts_settings(self, speaker_wav):
+        self.speaker_wav = speaker_wav
+
+
+    def assistant_start(self):
+        print("Starting assistant...")
+
+        print("Setting TTS speaker... ", end="")
+        self.tts.change_speaker(self.speaker_wav)
+        print("Done!")
+
+        print("Starting VAD recording thread... ", end="")
+        self.vad_rec.start_vad_recorder(
+            target_device_name = self.audio_input_device_name,
+            window_size_sec = self.window_size_sec,
+            vad_threshold = self.vad_threshold,
+            min_silence_duration_ms = self.min_silence_duration_ms,
+            speech_pad_ms = self.speech_pad_ms
+        )
+        print("Done!")
+
+        print("LISTENING!!!")
+
+        last_recog_time = time.perf_counter()
+        speech_recog_text = ""
+
+        self.assistant_running = True
+
+        while self.assistant_running:
+            if self.vad_rec.speech:
+                last_recog_time = time.perf_counter()
+
+            if len(self.vad_rec.audios_for_whisper) > 0:
+                #stream_out.write(audios_for_whisper.pop(0))
+                audio = np.array(self.vad_rec.audios_for_whisper.pop(0), dtype=np.float32)
+
+                segments, _ = self.whisper_model.transcribe(audio, language="pl")
+                if not self.assistant_running:
+                    break
+
+                text = "".join([segment.text for segment in segments])
+                #speech_recog_text += " " if len(speech_recog_text) else "" + text
+
+                if len(text) == 0:
+                    continue
+
+                if text[-1] not in ".,!?":
+                    text += "."
+
+                speech_recog_text += text.strip() + "\n"
+
+                print("=========================================")
+                print(text)
+
+                last_recog_time = time.perf_counter()
+
+            elif time.perf_counter() - last_recog_time > self.speech_recog_timeout and len(speech_recog_text) > 0:
+                speech_recog_text = speech_recog_text.strip()
+
+                print("=========================================\n\n")
+                print("-----------------------------------------")
+                print("!!!!!!!!!! SENDING TO CHATGPT !!!!!!!!!!!")
+                print("-----------------------------------------")
+                print(speech_recog_text)
+                print("-----------------------------------------\n\n")
+
+                time.sleep(1)  # fake ChatGPT delay
+
+                gpt_response = "czat dżi pi ti plejsholder 1 2 3"
+                if not self.assistant_running:
+                    break
+
+                speech_recog_text = ""
+
+                # TTS
+                print("Speech synthesis stream started!")
+                self.tts.tts_speak(gpt_response)
+
+            #print(len(audios_for_whisper), time.perf_counter() - last_recog_time, len(speech_recog_text))
+
+            time.sleep(0.01)
+
+        # set assistant_running back to True to indicate that the loop has exited
+        self.assistant_running = True
+
+
+    def assistant_stop(self):
+        print("Stopping assistant... ", end="")
+
+        self.assistant_running = False
+
+        self.vad_rec.stop_vad_recorder()
+
+        print("Done!")
diff --git a/frontend/core_test.ipynb b/frontend/core_test.ipynb
new file mode 100644
index 0000000..42e3c5f
--- /dev/null
+++ b/frontend/core_test.ipynb
@@ -0,0 +1,61 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from core import Core\n",
+    "\n",
+    "core = Core(whisper_model_name = \"large-v3\")\n",
+    "\n",
+    "core.set_order_settings(\n",
+    "    phone_number = 123456789,\n",
+    "    order_items = \"1x margharitta\\n2x sos majonezowy\",\n",
+    "    delivery_address = \"ul. Amogusowa 1337, Suski Małe\"\n",
+    ")\n",
+    "\n",
+    "core.set_speech_recog_settings(\n",
+    "    speech_recog_timeout = 2.5,\n",
+    "    audio_input_device_name = \"Virtual\",\n",
+    "    audio_output_device_name = \"placeholder\",\n",
+    "    window_size_sec = 0.1,\n",
+    "    vad_threshold = 0.6,\n",
+    "    min_silence_duration_ms = 150,\n",
+    "    speech_pad_ms = 0\n",
+    ")\n",
+    "\n",
+    "core.set_tts_settings(\n",
+    "    speaker_wav = \"voices/lector.wav\"\n",
+    ")\n",
+    "\n",
+    "try:\n",
+    "    core.assistant_start()\n",
+    "except (KeyboardInterrupt, SystemExit):\n",
+    "    core.assistant_stop()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/frontend/tts test.ipynb b/frontend/tts test.ipynb
index 41f3484..af8021b 100644
--- a/frontend/tts test.ipynb
+++ b/frontend/tts test.ipynb
@@ -6,9 +6,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from tts_stream import TTSstream\n",
+    "from tts_stream import TTSStream\n",
     "\n",
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
+    "tts = TTSStream(speaker_wav=\"voices/lector.wav\")"
    ]
   },
   {
@@ -27,7 +27,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n"
+    "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznych sieci neuronowych.\")\n"
    ]
   }
 ],
diff --git a/frontend/tts_stream.py b/frontend/tts_stream.py
index 0d7ab08..65e11ea 100644
--- a/frontend/tts_stream.py
+++ b/frontend/tts_stream.py
@@ -1,5 +1,4 @@
 import os
-import torch
 import pyaudio
 from TTS.api import TTS
 from TTS.tts.configs.xtts_config import XttsConfig
@@ -7,27 +6,35 @@ from TTS.tts.models.xtts import Xtts
 from TTS.utils.generic_utils import get_user_data_dir
 import threading
 import time
+import re
+
 
-# Check if CUDA is available
-if torch.cuda.is_available():
-    print("Using CUDA")
-    device = "cuda"
-else:
-    print("Using CPU")
-    device = "cpu"
 
 model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
 
-class TTSstream:
-    def __init__(self, speaker_wav):
+class TTSStream:
+    def __init__(self, speaker_wav=None, device=None):
         model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
 
+        if device is None:
+            import torch
+
+            # Check if CUDA is available
+            if torch.cuda.is_available():
+                print("Using CUDA")
+                device = "cuda"
+            else:
+                print("Using CPU")
+                device = "cpu"
+
         #print(model_path)
+
+        print("Loading TTS model... ", end="")
 
         # download model if it doesn't exist
         if not os.path.exists(os.path.join(model_path, "config.json")):
-            print("Downloading model...")
+            print("Downloading model... ", end="")
             tts = TTS()
             tts.download_model_by_name(model_name=model_name)
@@ -43,11 +50,16 @@
         )
 
         self.model.to(device)
+        print("Done!")
 
-        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+        if speaker_wav is not None:
+            #self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+            self.change_speaker(speaker_wav)
 
     def change_speaker(self, speaker_wav):
+        print("Loading speaker... ", end="")
         self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+        print("Done!")
 
     def _write_stream(self):
         # play first play_buffer_size samples and remove them from the buffer
@@ -58,6 +70,7 @@
                 self.stream.write(self.chunk)
             else:
                 if self.all_done:
+                    #self.thread_ended = True
                     break
                 time.sleep(0.01)
 
@@ -65,42 +78,60 @@
     def tts_speak(self, text):
         self.play_buffer_size = 512
 
-        chunks = self.model.inference_stream(
-            text,
-            "pl",
-            self.gpt_cond_latent,
-            self.speaker_embedding,
-            stream_chunk_size=20,
-        )
-
         # open pyaudio stream
         p = pyaudio.PyAudio()
         self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)
 
+        # for each sentence ending with . or ! or ?
+        for sentence in re.split(r"(?<=[.!?])", text):
+            sentence = sentence.strip()
 
-        self.chunks_bin = b""
-        self.all_done = False
-
-        # run write_stream as thread
-        thread = threading.Thread(target=self._write_stream)
-        thread.start()
-
-        while True:
-            try:
-                # read chunks from chunks generator as they are generated
-                for self.chunk in chunks:
-                    self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
-                break
-            # some weird error caused by coqui-tts
-            except:
-                print("Error occured when generating audio stream. Retrying...")
+            if len(sentence) == 0:
                 continue
 
-        self.all_done = True
+            chunks = self.model.inference_stream(
+                sentence,
+                "pl",
+                self.gpt_cond_latent,
+                self.speaker_embedding,
+                stream_chunk_size=20,
+            )
 
-        # wait for thread to finish
-        thread.join()
+
+            self.chunks_bin = b""
+            self.all_done = False
+
+            # run write_stream as thread
+            #self.thread_ended = False
+            thread = threading.Thread(target=self._write_stream)
+            thread.start()
+
+            while True:
+                try:
+                    # read chunks from the chunks generator as they are generated
+                    for self.chunk in chunks:
+                        self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
+                    break
+                # some weird error caused by coqui-tts
+                except Exception:
+                    print("Error occurred when generating audio stream. Retrying...")
+                    continue
+
+            self.all_done = True
+
+            # wait for thread to finish
+            thread.join()
+
+            # wait for thread ended
+            #while not self.thread_ended:
+            #    time.sleep(0.01)
+
+            #while True:
+            #    if self.thread_ended:
+            #        break
+            #    print("Waiting for thread to end...")
+            #    time.sleep(0.01)
 
         self.stream.close()
         p.terminate()
diff --git a/frontend/vad_recorder.py b/frontend/vad_recorder.py
new file mode 100644
index 0000000..192b9cd
--- /dev/null
+++ b/frontend/vad_recorder.py
@@ -0,0 +1,149 @@
+import torch
+import pyaudio
+import numpy as np
+import time
+import onnxruntime as ort
+import threading
+
+ort.set_default_logger_severity(3)
+
+
+SAMPLERATE = 16000
+
+class VADRecorder:
+    #def __init__(self, target_device_name, window_size_sec = 0.2, use_onnx = True):
+    def __init__(self, use_onnx = True):
+
+        print("Loading Silero VAD model... ", end="")
+
+        self.vad_model, utils = torch.hub.load(
+            repo_or_dir="snakers4/silero-vad",
+            model="silero_vad",
+            force_reload=False,
+            onnx=use_onnx
+        )
+
+        (
+            _,  # get_speech_timestamps
+            _,  # save_audio
+            _,  # read_audio
+            self.VADIterator,
+            _   # collect_chunks
+        ) = utils
+
+        print("Done!")
+
+        self.vad_iterator = None
+
+
+    def _vad_recorder(self):
+        print("Listening...")
+
+        speech_win = 0
+        detected_audio = []
+
+        last_chunk = np.zeros(self.window_size, dtype=np.float32)
+
+        # The VAD iterator needs to be reloaded periodically because after
+        # running for a while it freaks out and hallucinates speech.
+        vad_iter_reload_delay = 60 * 2  # seconds
+        vad_iter_load_time = time.time()
+
+        self.vad_iterator = self.VADIterator(
+            self.vad_model,
+            threshold = self.vad_threshold,
+            sampling_rate = SAMPLERATE,
+            min_silence_duration_ms = self.min_silence_duration_ms,
+            speech_pad_ms = self.speech_pad_ms
+        )
+
+        while self.rec_flag:
+            chunk = np.frombuffer(self.stream_in.read(self.window_size), dtype=np.float32)
+
+            speech_dict = self.vad_iterator(chunk)
+
+            # check if speech_dict is {"start": x} or {"end": x}
+            if speech_dict is not None:
+                self.speech = "start" in speech_dict
+
+            if self.speech:
+                #print("Speech detected!")
+                if speech_win == 0:
+                    detected_audio = last_chunk.tolist()
+                speech_win += 1
+                detected_audio += chunk.tolist()
+
+            else:
+                # only reload the iterator while no speech is being recorded
+                if time.time() - vad_iter_load_time > vad_iter_reload_delay:
+                    self.vad_iterator.reset_states()
+
+                    vad_iter_load_time = time.time()
+
+                    self.vad_iterator = self.VADIterator(
+                        self.vad_model,
+                        threshold = self.vad_threshold,
+                        sampling_rate = SAMPLERATE,
+                        min_silence_duration_ms = self.min_silence_duration_ms,
+                        speech_pad_ms = self.speech_pad_ms
+                    )
+
+                    print("Reloaded VADIterator!")
+
+                if speech_win > 0:
+                    speech_win = 0
+
+                    self.audios_for_whisper.append(detected_audio)
+
+            last_chunk = chunk.copy()
+
+
+    def start_vad_recorder(self, target_device_name, window_size_sec = 0.1, vad_threshold = 0.6, min_silence_duration_ms = 150, speech_pad_ms = 0):
+
+        self.window_size = int(window_size_sec * SAMPLERATE)
+
+        self.vad_threshold = vad_threshold
+        self.min_silence_duration_ms = min_silence_duration_ms
+        self.speech_pad_ms = speech_pad_ms
+
+        self.p = pyaudio.PyAudio()
+
+        target_device_index = None
+        for i in range(self.p.get_device_count()):
+            device_info = self.p.get_device_info_by_index(i)
+            if device_info['maxInputChannels'] > 0 and target_device_name in device_info['name']:
+                target_device_index = i
+                break
+
+        if target_device_index is None:
+            print(f"No target device found with \"{target_device_name}\" in its name.")
+            exit()
+
+        try:
+            self.stream_in = self.p.open(format=pyaudio.paFloat32, channels=1, rate=SAMPLERATE, input=True, frames_per_buffer=self.window_size, input_device_index=target_device_index)
+        except OSError:
+            print(f"An unexpected error occurred when trying to open the device stream with \"{target_device_name}\" in its name. That could be caused by the device being disabled or unplugged.")
+            exit()
+
+        self.speech = False
+        self.audios_for_whisper = []
+
+        if self.vad_iterator is not None:
+            self.vad_iterator.reset_states()
+
+        self.rec_flag = True
+        self.vad_rec_thread = threading.Thread(target=self._vad_recorder, daemon=True)
+        self.vad_rec_thread.start()
+
+
+    def stop_vad_recorder(self):
+        self.rec_flag = False
+        self.vad_rec_thread.join()
+
+        self.stream_in.stop_stream()
+        self.stream_in.close()
+        self.p.terminate()
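
For reference, the contract `_vad_recorder` builds on: `VADIterator` consumes fixed-size chunks and returns `None` while the speech state is unchanged, a `{"start": ...}` dict when speech begins and an `{"end": ...}` dict when it ends — exactly what the `"start" in speech_dict` check exploits. A minimal offline sketch against a file instead of a microphone, using the same snakers4/silero-vad hub entry point as above; `speech.wav` is a placeholder path, the 0.1 s chunk size mirrors the recorder's default, and newer silero-vad releases may constrain chunk sizes, so this assumes the model version served here:

import torch

model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    onnx=True,
)
(_, _, read_audio, VADIterator, _) = utils  # same unpacking order as the recorder

vad_iterator = VADIterator(
    model,
    threshold=0.6,
    sampling_rate=16000,
    min_silence_duration_ms=150,
    speech_pad_ms=0,
)

wav = read_audio("speech.wav", sampling_rate=16000)  # placeholder input file

window_size = int(0.1 * 16000)  # 1600 samples, matching window_size_sec = 0.1
for i in range(0, len(wav) - window_size + 1, window_size):
    speech_dict = vad_iterator(wav[i:i + window_size])
    if speech_dict is not None:
        print(speech_dict)  # {'start': sample_index} or {'end': sample_index}

vad_iterator.reset_states()  # same reset the recorder performs between sessions
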
diff --git a/frontend/vad_recorder_test.ipynb b/frontend/vad_recorder_test.ipynb
new file mode 100644
index 0000000..23aa2ee
--- /dev/null
+++ b/frontend/vad_recorder_test.ipynb
@@ -0,0 +1,69 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from vad_recorder import VADRecorder\n",
+    "import time\n",
+    "\n",
+    "# Instantiate VADRecorder; the target device name and other parameters are passed to start_vad_recorder below\n",
+    "vad_rec = VADRecorder()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Starting VAD recording thread\")\n",
+    "\n",
+    "try:\n",
+    "    vad_rec.start_vad_recorder(target_device_name=\"Virtual\")\n",
+    "\n",
+    "\n",
+    "    recordings_count = 0\n",
+    "    now_speech = False\n",
+    "\n",
+    "    print(\"Done!\")\n",
+    "    while True:\n",
+    "        if vad_rec.speech != now_speech:\n",
+    "            now_speech = vad_rec.speech\n",
+    "            print(f\"Speech: {now_speech}\")\n",
+    "        if len(vad_rec.audios_for_whisper) != recordings_count:\n",
+    "            recordings_count = len(vad_rec.audios_for_whisper)\n",
+    "            print(f\"Recordings count: {recordings_count}\")\n",
+    "\n",
+    "        time.sleep(0.01)\n",
+    "\n",
+    "except (KeyboardInterrupt, SystemExit):\n",
+    "    print(\"Cleaning up...\")\n",
+    "    vad_rec.stop_vad_recorder()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/frontend/whisper_tts_test.ipynb b/frontend/whisper_tts_test.ipynb
deleted file mode 100644
index f295e22..0000000
--- a/frontend/whisper_tts_test.ipynb
+++ /dev/null
@@ -1,126 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import whisper\n",
-    "from tts_stream import TTSstream"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sounddevice as sd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import torch\n",
-    "\n",
-    "# force matplotlib gui backend\n",
-    "import matplotlib\n",
-    "matplotlib.use('TkAgg')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Loading whisper model...\")\n",
-    "\n",
-    "if torch.cuda.is_available():\n",
-    "    print(\"using CUDA\")\n",
-    "    device = \"cuda\"\n",
-    "else:\n",
-    "    print(\"using CPU\")\n",
-    "    device = \"cpu\"\n",
-    "\n",
-    "model = whisper.load_model(\"medium\").to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# optional for changing speaker to some another one\n",
-    "tts.change_speaker(\"voices/speaker_name.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# record 10 seconds of audio with sounddevice\n",
-    "print(\"Recording audio...\")\n",
-    "\n",
-    "fs = 16000\n",
-    "duration = 4\n",
-    "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
-    "sd.wait()\n",
-    "\n",
-    "\n",
-    "\n",
-    "frames = frames[:, 0]\n",
-    "#frames /= np.max(np.abs(frames))\n",
-    "\n",
-    "\n",
-    "## plot audio\n",
-    "#plt.plot(frames)\n",
-    "## set plot range to -1, 1\n",
-    "#plt.ylim(-1, 1)\n",
-    "#plt.show()\n",
-    "\n",
-    "# recognize text from audio\n",
-    "print(\"Recognizing text...\")\n",
-    "\n",
-    "result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
-    "whisper_text = result[\"text\"]\n",
-    "print(whisper_text)\n",
-    "\n",
-    "# synthesize text to audio\n",
-    "print(\"Synthesizing audio...\")\n",
-    "tts.tts_speak(whisper_text)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}