Compare commits
No commits in common. "af6f2a13b7c976008e0d5936045ca6c1c89bf09a" and "c87486728d5570554d5f152e959c2a4961a864e9" have entirely different histories.
af6f2a13b7...c87486728d
.gitignore (vendored): 6 changes
@@ -1,9 +1,3 @@
 finetune_dialogs_tool/temp/
 finetune_dialogs_tool/output_dialogs/
 finetune_dialogs_tool/__pycache__/
-
-frontend/voices/*
-!frontend/voices/lector.wav
-!frontend/voices/lector source.txt
-
-frontend/__pycache__/
@@ -1,2 +1 @@
 gradio3
-TTS
@@ -1,55 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tts_stream import TTSstream\n",
-    "\n",
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# optional for changing speaker to some another one\n",
-    "tts.change_speaker(\"voices/speaker_name.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
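For readers who prefer a plain script over a notebook, the deleted cells above boil down to three calls on the TTSstream class, whose implementation is removed in the next hunk (imported as tts_stream in the notebooks). A minimal script-form sketch follows; it reuses the notebook's own paths and text, and assumes it is run from the directory that contains tts_stream.py and the voices/ folder.

# Script-form sketch of the deleted notebook; paths and text come from the cells above.
from tts_stream import TTSstream

if __name__ == "__main__":
    # Build the streaming synthesizer around a reference voice recording.
    tts = TTSstream(speaker_wav="voices/lector.wav")

    # Optional: switch to another reference voice (the file name here is the
    # notebook's placeholder, not a file known to exist in the repository).
    # tts.change_speaker("voices/speaker_name.wav")

    # Synthesize and play Polish speech through the default audio output.
    tts.tts_speak("Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.")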
@@ -1,106 +0,0 @@
-import os
-import torch
-import pyaudio
-from TTS.api import TTS
-from TTS.tts.configs.xtts_config import XttsConfig
-from TTS.tts.models.xtts import Xtts
-from TTS.utils.generic_utils import get_user_data_dir
-import threading
-import time
-
-# Check if CUDA is available
-if torch.cuda.is_available():
-    print("Using CUDA")
-    device = "cuda"
-else:
-    print("Using CPU")
-    device = "cpu"
-
-model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-
-
-class TTSstream:
-    def __init__(self, speaker_wav):
-        model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
-
-        #print(model_path)
-        #
-        # download model if it doesn't exist
-        if not os.path.exists(os.path.join(model_path, "config.json")):
-            print("Downloading model...")
-            tts = TTS()
-            tts.download_model_by_name(model_name=model_name)
-
-        config = XttsConfig()
-        config.load_json(os.path.join(model_path, "config.json"))
-        self.model = Xtts.init_from_config(config)
-        self.model.load_checkpoint(
-            config,
-            checkpoint_path=os.path.join(model_path, "model.pth"),
-            vocab_path=os.path.join(model_path, "vocab.json"),
-            eval=True,
-            use_deepspeed=False
-        )
-        self.model.to(device)
-
-
-        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
-
-    def change_speaker(self, speaker_wav):
-        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
-
-    def _write_stream(self):
-        # play first play_buffer_size samples and remove them from the buffer
-        while True:
-            if len(self.chunks_bin) > 0:
-                self.chunk = self.chunks_bin[:self.play_buffer_size]
-                self.chunks_bin = self.chunks_bin[self.play_buffer_size:]
-                self.stream.write(self.chunk)
-            else:
-                if self.all_done:
-                    break
-                time.sleep(0.01)
-
-
-    def tts_speak(self, text):
-        self.play_buffer_size = 512
-
-        chunks = self.model.inference_stream(
-            text,
-            "pl",
-            self.gpt_cond_latent,
-            self.speaker_embedding,
-            stream_chunk_size=20,
-        )
-
-
-        # open pyaudio stream
-        p = pyaudio.PyAudio()
-        self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)
-
-
-        self.chunks_bin = b""
-        self.all_done = False
-
-        # run write_stream as thread
-        thread = threading.Thread(target=self._write_stream)
-        thread.start()
-
-        while True:
-            try:
-                # read chunks from chunks generator as they are generated
-                for self.chunk in chunks:
-                    self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
-                break
-            # some weird error caused by coqui-tts
-            except:
-                print("Error occured when generating audio stream. Retrying...")
-                continue
-
-        self.all_done = True
-
-        # wait for thread to finish
-        thread.join()
-
-        self.stream.close()
-        p.terminate()
@@ -1 +0,0 @@
-https://opengameart.org/content/voiceover-pack-fighter-40-taunts
Binary file not shown.
@@ -1,126 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import whisper\n",
-    "from tts_stream import TTSstream"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sounddevice as sd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import torch\n",
-    "\n",
-    "# force matplotlib gui backend\n",
-    "import matplotlib\n",
-    "matplotlib.use('TkAgg')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Loading whisper model...\")\n",
-    "\n",
-    "if torch.cuda.is_available():\n",
-    "    print(\"using CUDA\")\n",
-    "    device = \"cuda\"\n",
-    "else:\n",
-    "    print(\"using CPU\")\n",
-    "    device = \"cpu\"\n",
-    "\n",
-    "model = whisper.load_model(\"medium\").to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# optional for changing speaker to some another one\n",
-    "tts.change_speaker(\"voices/speaker_name.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# record 10 seconds of audio with sounddevice\n",
-    "print(\"Recording audio...\")\n",
-    "\n",
-    "fs = 16000\n",
-    "duration = 4\n",
-    "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
-    "sd.wait()\n",
-    "\n",
-    "\n",
-    "\n",
-    "frames = frames[:, 0]\n",
-    "#frames /= np.max(np.abs(frames))\n",
-    "\n",
-    "\n",
-    "## plot audio\n",
-    "#plt.plot(frames)\n",
-    "## set plot range to -1, 1\n",
-    "#plt.ylim(-1, 1)\n",
-    "#plt.show()\n",
-    "\n",
-    "# recognize text from audio\n",
-    "print(\"Recognizing text...\")\n",
-    "\n",
-    "result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
-    "whisper_text = result[\"text\"]\n",
-    "print(whisper_text)\n",
-    "\n",
-    "# synthesize text to audio\n",
-    "print(\"Synthesizing audio...\")\n",
-    "tts.tts_speak(whisper_text)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
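Taken together, the deleted notebook implements a record → transcribe → synthesize round trip: sounddevice captures a short 16 kHz mono clip, Whisper transcribes it, and TTSstream speaks the transcript back in the cloned voice. A plain-script sketch of the same pipeline follows; the model size, paths, language, and recording parameters are taken from the cells above, while the prompt-and-repeat loop is an assumption added for illustration.

# Script-form sketch of the deleted notebook's record -> transcribe -> speak flow.
# Model names, paths, and parameters follow the cells above; the input() loop is added.
import numpy as np
import sounddevice as sd
import torch
import whisper

from tts_stream import TTSstream

device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium").to(device)    # speech-to-text
tts = TTSstream(speaker_wav="voices/lector.wav")   # voice-cloned text-to-speech

FS = 16000    # Whisper expects 16 kHz mono float32 audio
DURATION = 4  # seconds per recording, as in the notebook

while True:
    input("Press Enter to record...")
    frames = sd.rec(int(DURATION * FS), samplerate=FS, channels=1)
    sd.wait()
    audio = frames[:, 0].astype(np.float32)  # mono float32 array for Whisper

    result = model.transcribe(audio, language="pl", fp16=False)
    text = result["text"].strip()
    print("Heard:", text)

    if text:
        tts.tts_speak(text)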