Compare commits
No commits in common. "af6f2a13b7c976008e0d5936045ca6c1c89bf09a" and "c87486728d5570554d5f152e959c2a4961a864e9" have entirely different histories.
af6f2a13b7...c87486728d
.gitignore (vendored): 6 changes
@@ -1,9 +1,3 @@
 finetune_dialogs_tool/temp/
 finetune_dialogs_tool/output_dialogs/
 finetune_dialogs_tool/__pycache__/
-
-frontend/voices/*
-!frontend/voices/lector.wav
-!frontend/voices/lector source.txt
-
-frontend/__pycache__/
@@ -1,2 +1 @@
 gradio3
-TTS
@@ -1,55 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from tts_stream import TTSstream\n",
-    "\n",
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# optional for changing speaker to some another one\n",
-    "tts.change_speaker(\"voices/speaker_name.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tts.tts_speak(\"Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.\")\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
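For readers who prefer a plain script over a notebook, the deleted cells above boil down to three calls on the TTSstream class, whose implementation is removed in the next hunk (imported as tts_stream in the notebooks). A minimal script-form sketch follows; it reuses the notebook's own paths and text, and assumes it is run from the directory that contains tts_stream.py and the voices/ folder.

# Script-form sketch of the deleted notebook; paths and text come from the cells above.
from tts_stream import TTSstream

if __name__ == "__main__":
    # Build the streaming synthesizer around a reference voice recording.
    tts = TTSstream(speaker_wav="voices/lector.wav")

    # Optional: switch to another reference voice (the file name here is the
    # notebook's placeholder, not a file known to exist in the repository).
    # tts.change_speaker("voices/speaker_name.wav")

    # Synthesize and play Polish speech through the default audio output.
    tts.tts_speak("Testowanie syntezy naturalnego głosu za pomocą sztucznej sieci neuronowej.")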
@@ -1,106 +0,0 @@
-import os
-import torch
-import pyaudio
-from TTS.api import TTS
-from TTS.tts.configs.xtts_config import XttsConfig
-from TTS.tts.models.xtts import Xtts
-from TTS.utils.generic_utils import get_user_data_dir
-import threading
-import time
-
-# Check if CUDA is available
-if torch.cuda.is_available():
-    print("Using CUDA")
-    device = "cuda"
-else:
-    print("Using CPU")
-    device = "cpu"
-
-model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-
-
-class TTSstream:
-    def __init__(self, speaker_wav):
-        model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
-
-        #print(model_path)
-        #
-        # download model if it doesn't exist
-        if not os.path.exists(os.path.join(model_path, "config.json")):
-            print("Downloading model...")
-            tts = TTS()
-            tts.download_model_by_name(model_name=model_name)
-
-        config = XttsConfig()
-        config.load_json(os.path.join(model_path, "config.json"))
-        self.model = Xtts.init_from_config(config)
-        self.model.load_checkpoint(
-            config,
-            checkpoint_path=os.path.join(model_path, "model.pth"),
-            vocab_path=os.path.join(model_path, "vocab.json"),
-            eval=True,
-            use_deepspeed=False
-        )
-        self.model.to(device)
-
-
-        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
-
-    def change_speaker(self, speaker_wav):
-        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)
-
-    def _write_stream(self):
-        # play first play_buffer_size samples and remove them from the buffer
-        while True:
-            if len(self.chunks_bin) > 0:
-                self.chunk = self.chunks_bin[:self.play_buffer_size]
-                self.chunks_bin = self.chunks_bin[self.play_buffer_size:]
-                self.stream.write(self.chunk)
-            else:
-                if self.all_done:
-                    break
-                time.sleep(0.01)
-
-
-    def tts_speak(self, text):
-        self.play_buffer_size = 512
-
-        chunks = self.model.inference_stream(
-            text,
-            "pl",
-            self.gpt_cond_latent,
-            self.speaker_embedding,
-            stream_chunk_size=20,
-        )
-
-
-        # open pyaudio stream
-        p = pyaudio.PyAudio()
-        self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)
-
-
-        self.chunks_bin = b""
-        self.all_done = False
-
-        # run write_stream as thread
-        thread = threading.Thread(target=self._write_stream)
-        thread.start()
-
-        while True:
-            try:
-                # read chunks from chunks generator as they are generated
-                for self.chunk in chunks:
-                    self.chunks_bin += self.chunk.cpu().numpy().astype("float32").tobytes()
-                break
-            # some weird error caused by coqui-tts
-            except:
-                print("Error occured when generating audio stream. Retrying...")
-                continue
-
-        self.all_done = True
-
-        # wait for thread to finish
-        thread.join()
-
-        self.stream.close()
-        p.terminate()
@@ -1 +0,0 @@
-https://opengameart.org/content/voiceover-pack-fighter-40-taunts
Binary file not shown.
@@ -1,126 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import whisper\n",
-    "from tts_stream import TTSstream"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import sounddevice as sd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "import torch\n",
-    "\n",
-    "# force matplotlib gui backend\n",
-    "import matplotlib\n",
-    "matplotlib.use('TkAgg')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(\"Loading whisper model...\")\n",
-    "\n",
-    "if torch.cuda.is_available():\n",
-    "    print(\"using CUDA\")\n",
-    "    device = \"cuda\"\n",
-    "else:\n",
-    "    print(\"using CPU\")\n",
-    "    device = \"cpu\"\n",
-    "\n",
-    "model = whisper.load_model(\"medium\").to(device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# optional for changing speaker to some another one\n",
-    "tts.change_speaker(\"voices/speaker_name.wav\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# record 10 seconds of audio with sounddevice\n",
-    "print(\"Recording audio...\")\n",
-    "\n",
-    "fs = 16000\n",
-    "duration = 4\n",
-    "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1)\n",
-    "sd.wait()\n",
-    "\n",
-    "\n",
-    "\n",
-    "frames = frames[:, 0]\n",
-    "#frames /= np.max(np.abs(frames))\n",
-    "\n",
-    "\n",
-    "## plot audio\n",
-    "#plt.plot(frames)\n",
-    "## set plot range to -1, 1\n",
-    "#plt.ylim(-1, 1)\n",
-    "#plt.show()\n",
-    "\n",
-    "# recognize text from audio\n",
-    "print(\"Recognizing text...\")\n",
-    "\n",
-    "result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
-    "whisper_text = result[\"text\"]\n",
-    "print(whisper_text)\n",
-    "\n",
-    "# synthesize text to audio\n",
-    "print(\"Synthesizing audio...\")\n",
-    "tts.tts_speak(whisper_text)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
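Taken together, the deleted notebook implements a record → transcribe → synthesize round trip: sounddevice captures a short 16 kHz mono clip, Whisper transcribes it, and TTSstream speaks the transcript back in the cloned voice. A plain-script sketch of the same pipeline follows; the model size, paths, language, and recording parameters are taken from the cells above, while the prompt-and-repeat loop is an assumption added for illustration.

# Script-form sketch of the deleted notebook's record -> transcribe -> speak flow.
# Model names, paths, and parameters follow the cells above; the input() loop is added.
import numpy as np
import sounddevice as sd
import torch
import whisper

from tts_stream import TTSstream

device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("medium").to(device)    # speech-to-text
tts = TTSstream(speaker_wav="voices/lector.wav")   # voice-cloned text-to-speech

FS = 16000    # Whisper expects 16 kHz mono float32 audio
DURATION = 4  # seconds per recording, as in the notebook

while True:
    input("Press Enter to record...")
    frames = sd.rec(int(DURATION * FS), samplerate=FS, channels=1)
    sd.wait()
    audio = frames[:, 0].astype(np.float32)  # mono float32 array for Whisper

    result = model.transcribe(audio, language="pl", fp16=False)
    text = result["text"].strip()
    print("Heard:", text)

    if text:
        tts.tts_speak(text)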