PizzAI/frontend/tts_stream.py
2023-11-15 19:57:17 +01:00

107 lines
3.2 KiB
Python

import os
import torch
import pyaudio
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
import threading
import time
# Pick the inference device once at import time; XTTS is far faster on GPU.
_has_cuda = torch.cuda.is_available()
device = "cuda" if _has_cuda else "cpu"
print("Using CUDA" if _has_cuda else "Using CPU")

# Coqui model identifier for the multilingual XTTS v2 checkpoint.
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
class TTSstream:
    """Streaming text-to-speech wrapper around Coqui XTTS v2.

    Loads the model once on construction, then synthesizes speech for
    arbitrary text and plays it through a PyAudio output stream while
    generation is still in progress (producer/consumer across two threads).
    """

    def __init__(self, speaker_wav):
        """Load the XTTS model (downloading it on first use) and compute
        conditioning latents for the reference voice.

        speaker_wav: path to a wav file of the target speaker's voice.
        """
        model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
        # Download the model the first time it is requested.
        if not os.path.exists(os.path.join(model_path, "config.json")):
            print("Downloading model...")
            tts = TTS()
            tts.download_model_by_name(model_name=model_name)
        config = XttsConfig()
        config.load_json(os.path.join(model_path, "config.json"))
        self.model = Xtts.init_from_config(config)
        self.model.load_checkpoint(
            config,
            checkpoint_path=os.path.join(model_path, "model.pth"),
            vocab_path=os.path.join(model_path, "vocab.json"),
            eval=True,
            use_deepspeed=False,
        )
        self.model.to(device)
        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)

    def change_speaker(self, speaker_wav):
        """Recompute conditioning latents so subsequent speech uses a new voice."""
        self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(audio_path=speaker_wav)

    def _write_stream(self):
        """Consumer thread body: play `play_buffer_size`-byte slices of the
        shared byte buffer until the producer signals completion via `all_done`.
        """
        while True:
            if len(self.chunks_bin) > 0:
                # FIX: local variable instead of `self.chunk`, which was also
                # written by the producer thread (cross-thread data race).
                chunk = self.chunks_bin[:self.play_buffer_size]
                self.chunks_bin = self.chunks_bin[self.play_buffer_size:]
                self.stream.write(chunk)
            else:
                if self.all_done:
                    break
                # Buffer drained but generation may still be running: back off briefly.
                time.sleep(0.01)

    def tts_speak(self, text, language="pl"):
        """Synthesize `text` and play it through the speakers as it is generated.

        text: the text to speak.
        language: ISO language code passed to XTTS; default "pl" preserves
            the previously hard-coded behavior.
        """
        self.play_buffer_size = 512
        chunks = self.model.inference_stream(
            text,
            language,
            self.gpt_cond_latent,
            self.speaker_embedding,
            stream_chunk_size=20,
        )
        # XTTS emits mono float32 audio at 24 kHz.
        p = pyaudio.PyAudio()
        self.stream = p.open(format=pyaudio.paFloat32, channels=1, rate=24000, output=True)
        self.chunks_bin = b""
        self.all_done = False
        # Playback runs concurrently with generation.
        thread = threading.Thread(target=self._write_stream)
        thread.start()
        # FIX: try/finally guarantees the playback thread is joined and
        # PortAudio is released even if generation raises — previously an
        # exception here left the thread spinning and PyAudio leaked.
        try:
            while True:
                try:
                    # Append generated samples as they arrive; the consumer
                    # thread drains them in parallel.
                    for chunk in chunks:
                        self.chunks_bin += chunk.cpu().numpy().astype("float32").tobytes()
                    break
                # FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit. Coqui-TTS occasionally raises
                # mid-stream; retry only on ordinary exceptions.
                except Exception:
                    print("Error occured when generating audio stream. Retrying...")
                    continue
        finally:
            self.all_done = True
            # Wait for the remaining buffered audio to finish playing.
            thread.join()
            self.stream.close()
            p.terminate()