big tts and speech recognition update

2023-11-15 19:57:17 +01:00 · 2023-11-15 19:57:17 +01:00 · 6a24bee99b
commit 6a24bee99b
parent c87486728d
6 changed files with 295 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,9 @@
 finetune_dialogs_tool/temp/
 finetune_dialogs_tool/output_dialogs/
 finetune_dialogs_tool/__pycache__/
+
+frontend/voices/*
+!frontend/voices/lector.wav
+!frontend/voices/lector source.txt
+
+frontend/__pycache__/
--- a/frontend/tts
+++ b/frontend/tts
@ -0,0 +1,55 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tts_stream import TTSstream\n",
+    "\n",
+    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional: switch to a different speaker voice\n",
+    "tts.change_speaker(\"voices/speaker_name.wav\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/frontend/tts_stream.py
+++ b/frontend/tts_stream.py
@ -0,0 +1,106 @@
+import os
+import torch
+import pyaudio
+from TTS.api import TTS
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from TTS.utils.generic_utils import get_user_data_dir
+import threading
+import time
+
+# Select the torch device once at import time: GPU if present, else CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print("Using CUDA" if device == "cuda" else "Using CPU")
+
+# Coqui TTS identifier of the XTTS v2 model loaded by TTSstream.
+model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+
+
+class TTSstream:
+    """Streaming text-to-speech built on the Coqui XTTS v2 model.
+
+    The model is loaded once in the constructor. ``tts_speak`` then generates
+    audio chunk by chunk and plays it through PyAudio while generation is
+    still running: the calling thread produces audio bytes into
+    ``chunks_bin`` and a background thread (``_write_stream``) consumes them.
+    """
+
+    def __init__(self, speaker_wav):
+        """Load the XTTS model and compute the voice conditioning.
+
+        Args:
+            speaker_wav: Path to a reference recording of the voice to clone.
+        """
+        model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
+        config_path = os.path.join(model_path, "config.json")
+
+        # download model if it doesn't exist
+        if not os.path.exists(config_path):
+            print("Downloading model...")
+            tts = TTS()
+            tts.download_model_by_name(model_name=model_name)
+
+        config = XttsConfig()
+        config.load_json(config_path)
+        self.model = Xtts.init_from_config(config)
+        self.model.load_checkpoint(
+            config,
+            checkpoint_path=os.path.join(model_path, "model.pth"),
+            vocab_path=os.path.join(model_path, "vocab.json"),
+            eval=True,
+            use_deepspeed=False,
+        )
+        self.model.to(device)
+
+        # Playback state shared between tts_speak (producer) and
+        # _write_stream (consumer). The lock guards chunks_bin.
+        self._buffer_lock = threading.Lock()
+        self.play_buffer_size = 512
+        self.chunks_bin = b""
+        self.all_done = True
+        self.stream = None
+
+        self.change_speaker(speaker_wav)
+
+    def change_speaker(self, speaker_wav):
+        """Recompute the voice conditioning from another reference recording.
+
+        Args:
+            speaker_wav: Path to a reference recording of the new voice.
+        """
+        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
+
+    def _write_stream(self):
+        """Playback thread body: drain ``chunks_bin`` into the audio stream.
+
+        Takes up to ``play_buffer_size`` bytes at a time from the front of the
+        buffer and writes them to ``self.stream``. Returns once the buffer is
+        empty and the producer has set ``all_done``.
+        """
+        while True:
+            with self._buffer_lock:
+                # Read the flag and the data in the same critical section:
+                # the producer sets all_done only after its last append, so
+                # "done and empty" here really means nothing is left to play.
+                finished = self.all_done
+                piece = self.chunks_bin[:self.play_buffer_size]
+                self.chunks_bin = self.chunks_bin[self.play_buffer_size:]
+
+            if piece:
+                # Blocking write happens outside the lock so the producer
+                # can keep appending while audio is playing.
+                self.stream.write(piece)
+            elif finished:
+                break
+            else:
+                time.sleep(0.01)
+
+    def tts_speak(self, text, language="pl"):
+        """Synthesize ``text`` and play it, blocking until playback ends.
+
+        Args:
+            text: Text to speak.
+            language: Language code passed to the model; defaults to "pl",
+                the value that was previously hard-coded.
+
+        Raises:
+            Exception: Any error raised while generating audio is propagated
+                after the playback thread and audio device have been released.
+        """
+        chunks = self.model.inference_stream(
+            text,
+            language,
+            self.gpt_cond_latent,
+            self.speaker_embedding,
+            stream_chunk_size=20,
+        )
+
+        # open pyaudio stream
+        # NOTE(review): assumes the model emits 24 kHz mono audio - confirm
+        # against the loaded model config.
+        p = pyaudio.PyAudio()
+        try:
+            self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)
+            try:
+                self.chunks_bin = b""
+                self.all_done = False
+
+                # run _write_stream as thread
+                thread = threading.Thread(target=self._write_stream)
+                thread.start()
+                try:
+                    # Read chunks as they are generated. The loop variable is
+                    # a local (not an attribute) so the playback thread can
+                    # never overwrite it between assignment and use.
+                    for chunk in chunks:
+                        audio = chunk.cpu().numpy().astype("float32").tobytes()
+                        with self._buffer_lock:
+                            self.chunks_bin += audio
+                finally:
+                    # Always let the playback thread finish, even on error,
+                    # otherwise it would spin forever waiting for data.
+                    self.all_done = True
+                    thread.join()
+            finally:
+                self.stream.close()
+        finally:
+            p.terminate()
--- a/frontend/voices/lector
+++ b/frontend/voices/lector
@ -0,0 +1 @@
+https://opengameart.org/content/voiceover-pack-fighter-40-taunts
--- a/frontend/voices/lector.wav
+++ b/frontend/voices/lector.wav
--- a/frontend/whisper_tts_test.ipynb
+++ b/frontend/whisper_tts_test.ipynb
@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import whisper\n",
+    "from tts_stream import TTSstream"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sounddevice as sd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import torch\n",
+    "\n",
+    "# force matplotlib gui backend\n",
+    "import matplotlib\n",
+    "matplotlib.use('TkAgg')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Loading whisper model...\")\n",
+    "\n",
+    "if torch.cuda.is_available():\n",
+    "    print(\"using CUDA\")\n",
+    "    device = \"cuda\"\n",
+    "else:\n",
+    "    print(\"using CPU\")\n",
+    "    device = \"cpu\"\n",
+    "\n",
+    "model = whisper.load_model(\"medium\").to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# optional: switch to a different speaker voice\n",
+    "tts.change_speaker(\"voices/speaker_name.wav\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# record `duration` seconds of audio with sounddevice\n",
+    "print(\"Recording audio...\")\n",
+    "\n",
+    "fs = 16000\n",
+    "duration = 4\n",
+    "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
+    "sd.wait()\n",
+    "\n",
+    "\n",
+    "\n",
+    "frames = frames[:, 0]\n",
+    "#frames /= np.max(np.abs(frames))\n",
+    "\n",
+    "\n",
+    "## plot audio\n",
+    "#plt.plot(frames)\n",
+    "## set plot range to -1, 1\n",
+    "#plt.ylim(-1, 1)\n",
+    "#plt.show()\n",
+    "\n",
+    "# recognize text from audio\n",
+    "print(\"Recognizing text...\")\n",
+    "\n",
+    "result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
+    "whisper_text = result[\"text\"]\n",
+    "print(whisper_text)\n",
+    "\n",
+    "# synthesize text to audio\n",
+    "print(\"Synthesizing audio...\")\n",
+    "tts.tts_speak(whisper_text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
				`@ -0,0 +1 @@`
				`https://opengameart.org/content/voiceover-pack-fighter-40-taunts`