{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Speech-to-speech demo: Whisper + TTSstream\n",
    "\n",
    "Records a short clip from the microphone, transcribes it with Whisper (Polish), and hands the recognised text to `TTSstream` for speech synthesis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel can simply Run All.\n",
    "import matplotlib\n",
    "\n",
    "# force matplotlib gui backend (selected before pyplot is imported)\n",
    "matplotlib.use('TkAgg')\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import sounddevice as sd\n",
    "import torch\n",
    "import whisper\n",
    "\n",
    "from tts_stream import TTSstream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Loading whisper model...\")\n",
    "\n",
    "# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.\n",
    "if torch.cuda.is_available():\n",
    "    print(\"using CUDA\")\n",
    "    device = \"cuda\"\n",
    "else:\n",
    "    print(\"using CPU\")\n",
    "    device = \"cpu\"\n",
    "\n",
    "# Load the checkpoint straight onto the selected device.\n",
    "model = whisper.load_model(\"medium\", device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the TTS engine with the reference speaker recording.\n",
    "tts = TTSstream(speaker_wav=\"voices/lector.wav\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: switch to a different speaker voice.\n",
    "# NOTE(review): the path below looks like a placeholder - confirm the file exists before running.\n",
    "tts.change_speaker(\"voices/speaker_name.wav\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record `duration` seconds of mono audio with sounddevice.\n",
    "print(\"Recording audio...\")\n",
    "\n",
    "fs = 16000  # sample rate in Hz; Whisper works on 16 kHz audio\n",
    "duration = 4  # length of the recording in seconds\n",
    "# float32 is requested explicitly because Whisper consumes float32 waveforms.\n",
    "frames = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype=\"float32\")\n",
    "sd.wait()  # block until the recording has finished\n",
    "\n",
    "# sd.rec returns an array of shape (samples, channels); keep the single channel as 1-D.\n",
    "frames = frames[:, 0]\n",
    "#frames /= np.max(np.abs(frames))\n",
    "\n",
    "## plot audio\n",
    "#plt.plot(frames)\n",
    "## set plot range to -1, 1\n",
    "#plt.ylim(-1, 1)\n",
    "#plt.show()\n",
    "\n",
    "# Recognize text from the recorded audio (Polish, decoding in fp32).\n",
    "print(\"Recognizing text...\")\n",
    "\n",
    "result = model.transcribe(frames, language=\"pl\", fp16=False)\n",
    "whisper_text = result[\"text\"]\n",
    "print(whisper_text)\n",
    "\n",
    "# Synthesize the recognised text to audio.\n",
    "print(\"Synthesizing audio...\")\n",
    "tts.tts_speak(whisper_text)"
   ]
  }
 ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 2 }