diff --git a/.gitignore b/.gitignore index 60ea2fd..b35bfd8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ finetune_dialogs_tool/temp/ finetune_dialogs_tool/output_dialogs/ -finetune_dialogs_tool/__pycache__/ \ No newline at end of file +finetune_dialogs_tool/__pycache__/ + +frontend/voices/* +!frontend/voices/lector.wav +!frontend/voices/lector source.txt + +frontend/__pycache__/ \ No newline at end of file diff --git a/frontend/tts test.ipynb b/frontend/tts test.ipynb new file mode 100644 index 0000000..41f3484 --- /dev/null +++ b/frontend/tts test.ipynb @@ -0,0 +1,55 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from tts_stream import TTSstream\n", + "\n", + "tts = TTSstream(speaker_wav=\"voices/lector.wav\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# optional for changing speaker to some another one\n", + "tts.change_speaker(\"voices/speaker_name.wav\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/frontend/tts_stream.py b/frontend/tts_stream.py new file mode 100644 index 0000000..0d7ab08 --- /dev/null +++ b/frontend/tts_stream.py @@ -0,0 +1,106 @@ +import os +import torch +import pyaudio +from TTS.api import TTS +from TTS.tts.configs.xtts_config import XttsConfig +from TTS.tts.models.xtts import Xtts 
class TTSstream:
    """Synthesize speech with XTTS v2 and play it while it is generated.

    The model is loaded once in the constructor.  ``tts_speak`` then runs a
    small producer/consumer pipeline: the calling thread converts the
    chunks yielded by the model into raw float32 bytes and appends them to
    ``chunks_bin``, while a short-lived worker thread drains that buffer
    into a PyAudio output stream.
    """

    # Sample rate (Hz) the output stream is opened with.
    SAMPLE_RATE = 24000
    # Number of *bytes* handed to the audio stream per write.  The data is
    # float32, so 512 bytes correspond to 128 mono samples.
    PLAY_BUFFER_SIZE = 512

    def __init__(self, speaker_wav):
        """Load the XTTS v2 model and condition it on a reference voice.

        The model files are looked up in the TTS user data directory and
        downloaded first when ``config.json`` is not present there.

        Args:
            speaker_wav: Path of the audio file used as the reference
                voice for the conditioning latents.
        """
        model_path = os.path.join(
            get_user_data_dir("tts"), model_name.replace("/", "--")
        )
        config_path = os.path.join(model_path, "config.json")

        # Download the model if it doesn't exist yet.
        if not os.path.exists(config_path):
            print("Downloading model...")
            TTS().download_model_by_name(model_name=model_name)

        config = XttsConfig()
        config.load_json(config_path)
        self.model = Xtts.init_from_config(config)
        self.model.load_checkpoint(
            config,
            checkpoint_path=os.path.join(model_path, "model.pth"),
            vocab_path=os.path.join(model_path, "vocab.json"),
            eval=True,
            use_deepspeed=False,
        )
        self.model.to(device)

        # Guards ``chunks_bin``, which is shared between the generating
        # thread and the playback thread.
        self._buffer_lock = threading.Lock()

        self.change_speaker(speaker_wav)

    def change_speaker(self, speaker_wav):
        """Recompute the conditioning latents from another reference file.

        Args:
            speaker_wav: Path of the audio file with the new voice.
        """
        self.gpt_cond_latent, self.speaker_embedding = (
            self.model.get_conditioning_latents(audio_path=speaker_wav)
        )

    def _write_stream(self):
        """Drain ``chunks_bin`` into ``stream`` until generation is done.

        Runs on the playback thread.  Each pass removes at most
        ``play_buffer_size`` bytes from the front of the buffer and writes
        them to the audio stream.  The loop ends once the buffer is empty
        and ``all_done`` is set.
        """
        while True:
            with self._buffer_lock:
                block = bytes(self.chunks_bin[:self.play_buffer_size])
                del self.chunks_bin[:self.play_buffer_size]
                # Sampled together with the buffer: ``all_done`` is only
                # set after the last append, so "empty and done" seen under
                # the lock means nothing is left to play.
                finished = self.all_done

            if block:
                # Written outside the lock so a blocking audio write does
                # not stall the generating thread.
                self.stream.write(block)
            elif finished:
                break
            else:
                time.sleep(0.01)

    def _buffer_chunks(self, chunks):
        """Append every generated chunk to ``chunks_bin`` as float32 bytes.

        A chunk that raises while being produced or converted is reported
        and skipped; iteration then resumes with the next item.  Only
        ``Exception`` subclasses are handled, so ``KeyboardInterrupt`` and
        ``SystemExit`` still propagate.

        Args:
            chunks: Iterable of objects supporting
                ``.cpu().numpy().astype("float32").tobytes()``.
        """
        # Create the iterator once so that a retry continues after the
        # failing item instead of restarting a re-iterable input.
        pending = iter(chunks)
        while True:
            try:
                # A local loop variable is used on purpose: sharing an
                # instance attribute with the playback thread lets that
                # thread replace the chunk with raw bytes mid-conversion.
                for chunk in pending:
                    data = chunk.cpu().numpy().astype("float32").tobytes()
                    with self._buffer_lock:
                        self.chunks_bin += data
                break
            except Exception:
                print("Error occured when generating audio stream. Retrying...")
                continue

    def tts_speak(self, text, language="pl"):
        """Synthesize ``text`` and play it, blocking until playback ends.

        Args:
            text: Text to speak.
            language: Language code forwarded to the model's streaming
                inference call.  Defaults to ``"pl"``.
        """
        self.play_buffer_size = self.PLAY_BUFFER_SIZE

        chunks = self.model.inference_stream(
            text,
            language,
            self.gpt_cond_latent,
            self.speaker_embedding,
            stream_chunk_size=20,
        )

        # Open the PyAudio output stream (mono float32).
        p = pyaudio.PyAudio()
        try:
            self.stream = p.open(
                format=pyaudio.paFloat32,
                channels=1,
                rate=self.SAMPLE_RATE,
                output=True,
            )
            try:
                self.chunks_bin = bytearray()
                self.all_done = False

                # Play buffered audio on a worker thread while this thread
                # keeps generating.
                thread = threading.Thread(target=self._write_stream)
                thread.start()
                try:
                    self._buffer_chunks(chunks)
                finally:
                    # Always release the worker, even when generation
                    # fails; otherwise it would poll forever.
                    self.all_done = True
                    thread.join()
            finally:
                self.stream.close()
        finally:
            p.terminate()
" print(\"using CPU\")\n", + " device = \"cpu\"\n", + "\n", + "model = whisper.load_model(\"medium\").to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tts = TTSstream(speaker_wav=\"voices/lector.wav\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# optional for changing speaker to some another one\n", + "tts.change_speaker(\"voices/speaker_name.wav\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# record 10 seconds of audio with sounddevice\n", + "print(\"Recording audio...\")\n", + "\n", + "fs = 16000\n", + "duration = 4\n", + "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n", + "sd.wait()\n", + "\n", + "\n", + "\n", + "frames = frames[:, 0]\n", + "#frames /= np.max(np.abs(frames))\n", + "\n", + "\n", + "## plot audio\n", + "#plt.plot(frames)\n", + "## set plot range to -1, 1\n", + "#plt.ylim(-1, 1)\n", + "#plt.show()\n", + "\n", + "# recognize text from audio\n", + "print(\"Recognizing text...\")\n", + "\n", + "result = model.transcribe(frames, language=\"pl\", fp16=False)\n", + "whisper_text = result[\"text\"]\n", + "print(whisper_text)\n", + "\n", + "# synthesize text to audio\n", + "print(\"Synthesizing audio...\")\n", + "tts.tts_speak(whisper_text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}