from vad_recorder import VADRecorder
from tts_stream import TTSStream
from chatgpt_wrap import ChatGPTWrap
from faster_whisper import WhisperModel

import torch
import time
import numpy as np
|
class Core:
    """Orchestrates the voice assistant: VAD recording, Whisper speech
    recognition, ChatGPT dialogue and TTS playback."""

    def __init__(self, whisper_model_name = "large-v3", use_chatgpt_placeholder = False):
        """Load all sub-components.

        Args:
            whisper_model_name: faster-whisper model identifier to load.
            use_chatgpt_placeholder: forwarded to ChatGPTWrap; when True the
                wrapper runs in placeholder mode (no real API calls).
        """
        self.use_chatgpt_placeholder = use_chatgpt_placeholder

        # Run on GPU when available, otherwise fall back to CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print("\n=======================================")
        print(f"Using {self.device.capitalize()} for:")
        print(" - Faster Whisper")
        print(" - TTS")
        print("=======================================\n")

        print("Loading Faster Whisper model... ", end="")
        # BUGFIX: float16 compute is only supported on CUDA; loading with
        # compute_type="float16" on CPU fails in faster-whisper/CTranslate2.
        # Use int8 quantization on CPU instead.
        compute_type = "float16" if self.device == "cuda" else "int8"
        self.whisper_model = WhisperModel(whisper_model_name, device=self.device, compute_type=compute_type)
        print("Done!")

        # VADRecorder, TTSStream and ChatGPTWrap have their own console loading messages
        self.vad_rec = VADRecorder()
        self.tts = TTSStream(device=self.device)
        self.gpt_wrap = ChatGPTWrap(use_chatgpt_placeholder)
2023-12-02 17:32:01 +01:00
|
|
|
def set_order_settings(self, phone_number, order_items, delivery_address, payment_method):
|
2023-12-01 21:48:56 +01:00
|
|
|
self.phone_number = phone_number
|
|
|
|
self.order_items = order_items
|
|
|
|
self.delivery_address = delivery_address
|
2023-12-02 17:32:01 +01:00
|
|
|
self.payment_method = payment_method
|
2023-12-01 21:48:56 +01:00
|
|
|
|
|
|
|
|
|
|
|
def set_speech_recog_settings(self, speech_recog_timeout, audio_input_device_name, audio_output_device_name, window_size_sec, vad_threshold, min_silence_duration_ms, speech_pad_ms):
|
|
|
|
self.speech_recog_timeout = speech_recog_timeout
|
|
|
|
self.audio_input_device_name = audio_input_device_name
|
|
|
|
self.audio_output_device_name = audio_output_device_name
|
|
|
|
self.window_size_sec = window_size_sec
|
|
|
|
self.vad_threshold = vad_threshold
|
|
|
|
self.min_silence_duration_ms = min_silence_duration_ms
|
|
|
|
self.speech_pad_ms = speech_pad_ms
|
|
|
|
|
|
|
|
|
|
|
|
def set_tts_settings(self, speaker_wav):
|
|
|
|
self.speaker_wav = speaker_wav
|
|
|
|
|
|
|
|
|
|
|
    def assistant_start(self):
        """Run the assistant's main loop (blocking).

        Sets the TTS speaker, starts the VAD recorder thread and initialises
        the ChatGPT order, then loops: transcribe recorded speech chunks
        with Whisper and, once no new speech has arrived for
        ``speech_recog_timeout`` seconds, send the accumulated transcript to
        ChatGPT and speak the reply.  Runs until ``assistant_running`` is
        cleared (via assistant_stop, also triggered by a "CALLEND" marker in
        the model's reply).

        NOTE(review): this loop consumes state mutated by the VAD recorder's
        background thread (``self.vad_rec.speech`` /
        ``self.vad_rec.audios_for_whisper``), so statement order here is
        deliberate.
        """
        print("Starting assistant...")

        print("Setting TTS speaker... ", end="")
        self.tts.change_speaker(self.speaker_wav)
        print("Done!")

        print("Starting VAD recording thread... ", end="")
        self.vad_rec.start_vad_recorder(
            target_device_name = self.audio_input_device_name,
            window_size_sec = self.window_size_sec,
            vad_threshold = self.vad_threshold,
            min_silence_duration_ms = self.min_silence_duration_ms,
            speech_pad_ms = self.speech_pad_ms
        )
        print("Done!")

        # Prime the ChatGPT conversation with the order details stored by
        # set_order_settings.
        self.gpt_wrap.init_order(
            self.phone_number,
            self.order_items,
            self.delivery_address,
            self.payment_method
        )

        print("LISTENING!!!")

        last_recog_time = time.perf_counter()  # time of last speech activity / recognition
        speech_recog_text = ""  # transcript accumulated since the last ChatGPT call

        self.assistant_running = True
        while self.assistant_running:
            # While the recorder reports active speech, keep pushing the
            # silence-timeout window forward so a partial utterance is not sent.
            if self.vad_rec.speech:
                last_recog_time = time.perf_counter()

            if len(self.vad_rec.audios_for_whisper) > 0:
                #stream_out.write(audios_for_whisper.pop(0))
                # Pop the oldest recorded chunk; the list is filled by the
                # VAD recorder thread started above.
                audio = np.array(self.vad_rec.audios_for_whisper.pop(0), dtype=np.float32)

                # Transcription is the slow step; the running flag may have
                # been cleared meanwhile, hence the re-check right after.
                segments, _ = self.whisper_model.transcribe(audio, language="pl")
                if not self.assistant_running:
                    break

                text = "".join([segment.text for segment in segments])
                #speech_recog_text += " " if len(speech_recog_text) else "" + text

                # Whisper can return an empty transcription for a chunk.
                if len(text) == 0:
                    continue

                # Terminate the sentence so the accumulated transcript stays
                # readable for ChatGPT.
                if not text[-1] in ".,!?":
                    text += "."

                speech_recog_text += text.strip() + "\n"

                print("=========================================")
                print(text)

                last_recog_time = time.perf_counter()

            elif time.perf_counter() - last_recog_time > self.speech_recog_timeout and len(speech_recog_text) > 0:
                # Silence exceeded the timeout and a transcript is pending:
                # hand it over to ChatGPT.
                speech_recog_text = speech_recog_text.strip()

                print("=========================================\n\n")
                print("-----------------------------------------")
                print("!!!!!!!!!! SENDING TO CHATGPT !!!!!!!!!!!")
                print("-----------------------------------------")
                print(speech_recog_text)
                print("-----------------------------------------\n\n")

                gpt_response = self.gpt_wrap.get_response(speech_recog_text)

                print("-----------------------------------------")
                if self.use_chatgpt_placeholder:
                    print("!!!!! CHATGPT PLACEHOLDER RESPONSE !!!!!!")
                else:
                    print("!!!!!!!!!!! CHATGPT RESPONSE !!!!!!!!!!!!")
                print("-----------------------------------------")
                print(gpt_response)
                print("-----------------------------------------\n\n")

                if not self.assistant_running:
                    break

                # Start a fresh transcript for the next exchange.
                speech_recog_text = ""

                # tts
                print("Speech synthesis stream started!")
                # "CALLEND" is a control marker in the reply, not meant to be
                # spoken aloud.
                self.tts.tts_speak(gpt_response.replace(" CALLEND", ""))

                # The model signals the end of the call with "CALLEND".
                if "CALLEND" in gpt_response:
                    self.assistant_stop()

            #print(len(audios_for_whisper), time.perf_counter() - last_recog_time, len(speech_recog_text))

            # Avoid busy-waiting between polls of the recorder state.
            time.sleep(0.01)

        # NOTE(review): an earlier comment claimed assistant_running is set
        # back to True after the loop exits, but no code here does that —
        # confirm whether any caller relies on it.
|
def assistant_stop(self):
|
|
|
|
print("Stopping assistant... ", end="")
|
|
|
|
|
|
|
|
self.assistant_running = False
|
|
|
|
|
|
|
|
self.vad_rec.stop_vad_recorder()
|
|
|
|
|
|
|
|
print("Done!")
|
|
|
|
|
|
|
|
|