from vad_recorder import VADRecorder
from tts_stream import TTSStream
from chatgpt_wrap import ChatGPTWrap
from faster_whisper import WhisperModel
import torch
import time
import numpy as np


class Core:
    """Voice assistant pipeline: VAD recording -> Whisper -> ChatGPT -> TTS.

    Typical use: construct the object, call the three ``set_*`` methods to
    provide the order, speech-recognition and TTS settings, then call
    :meth:`assistant_start`, which blocks until :meth:`assistant_stop` is
    called or the chat model ends the call.
    """

    # Marker the chat model puts in a reply to request the end of the call.
    _CALL_END_MARKER = "CALLEND"

    # A transcript ending in one of these is left as is; otherwise "." is added.
    _TERMINAL_PUNCTUATION = ".,!?"

    def __init__(self, whisper_model_name="large-v3", use_chatgpt_placeholder=False):
        """Load the Whisper model and create the recorder, TTS and chat wrappers.

        Args:
            whisper_model_name: Model name passed to ``WhisperModel``.
            use_chatgpt_placeholder: Forwarded to ``ChatGPTWrap``; also selects
                which banner is printed above each chat response.
        """
        self.use_chatgpt_placeholder = use_chatgpt_placeholder
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Defined up front so assistant_stop() is safe before assistant_start().
        self.assistant_running = False

        print("\n=======================================")
        print(f"Using {self.device.capitalize()} for:")
        print(" - Faster Whisper")
        print(" - TTS")
        print("=======================================\n")

        # float16 is not supported by the CPU backend, so fall back to int8
        # there; the CUDA path is unchanged.
        compute_type = "float16" if self.device == "cuda" else "int8"

        # flush=True: the message has no newline, so without an explicit flush
        # it would only show up together with "Done!".
        print("Loading Faster Whisper model... ", end="", flush=True)
        self.whisper_model = WhisperModel(
            whisper_model_name,
            device=self.device,
            compute_type=compute_type,
        )
        print("Done!")

        # VADRecorder, TTSStream and ChatGPTWrap have their own console loading messages
        self.vad_rec = VADRecorder()
        self.tts = TTSStream(device=self.device)
        self.gpt_wrap = ChatGPTWrap(use_chatgpt_placeholder)

    def set_order_settings(self, phone_number, order_items, delivery_address, payment_method):
        """Store the order details later passed to ``ChatGPTWrap.init_order``."""
        self.phone_number = phone_number
        self.order_items = order_items
        self.delivery_address = delivery_address
        self.payment_method = payment_method

    def set_speech_recog_settings(self, speech_recog_timeout, audio_input_device_name,
                                  audio_output_device_name, window_size_sec, vad_threshold,
                                  min_silence_duration_ms, speech_pad_ms):
        """Store the speech-recognition settings used by :meth:`assistant_start`.

        ``speech_recog_timeout`` is compared against ``time.perf_counter()``
        differences, i.e. it is expressed in seconds. The remaining values,
        except ``audio_output_device_name`` (stored but not read in this
        class), are forwarded unchanged to ``VADRecorder.start_vad_recorder``.
        """
        self.speech_recog_timeout = speech_recog_timeout
        self.audio_input_device_name = audio_input_device_name
        self.audio_output_device_name = audio_output_device_name
        self.window_size_sec = window_size_sec
        self.vad_threshold = vad_threshold
        self.min_silence_duration_ms = min_silence_duration_ms
        self.speech_pad_ms = speech_pad_ms

    def set_tts_settings(self, speaker_wav):
        """Store the speaker reference passed to ``TTSStream.change_speaker``."""
        self.speaker_wav = speaker_wav

    @classmethod
    def _normalize_transcript(cls, text):
        """Return *text* stripped and ending in punctuation, or "" if blank."""
        text = text.strip()
        if not text:
            return ""
        if text[-1] not in cls._TERMINAL_PUNCTUATION:
            text += "."
        return text

    @classmethod
    def _strip_call_end_marker(cls, response):
        """Return *response* with the call-end marker removed for speaking."""
        marker = cls._CALL_END_MARKER
        # Drop the " CALLEND" form first so no stray space is left behind,
        # then any marker that was not preceded by a space.
        return response.replace(" " + marker, "").replace(marker, "")

    def assistant_start(self):
        """Run the listen / transcribe / answer / speak loop until stopped.

        Requires the three ``set_*`` methods to have been called first. The
        loop ends when ``assistant_running`` becomes false, either through
        :meth:`assistant_stop` or because a chat response contained the
        call-end marker.
        """
        print("Starting assistant...")

        print("Setting TTS speaker... ", end="", flush=True)
        self.tts.change_speaker(self.speaker_wav)
        print("Done!")

        print("Starting VAD recording thread... ", end="", flush=True)
        self.vad_rec.start_vad_recorder(
            target_device_name=self.audio_input_device_name,
            window_size_sec=self.window_size_sec,
            vad_threshold=self.vad_threshold,
            min_silence_duration_ms=self.min_silence_duration_ms,
            speech_pad_ms=self.speech_pad_ms,
        )
        print("Done!")

        self.gpt_wrap.init_order(
            self.phone_number,
            self.order_items,
            self.delivery_address,
            self.payment_method,
        )

        print("LISTENING!!!")

        last_recog_time = time.perf_counter()
        speech_recog_text = ""  # newline-separated utterances not yet sent to the chat model

        self.assistant_running = True
        while self.assistant_running:
            # While the recorder reports speech, keep postponing the timeout.
            if self.vad_rec.speech:
                last_recog_time = time.perf_counter()

            if len(self.vad_rec.audios_for_whisper) > 0:
                audio = np.array(self.vad_rec.audios_for_whisper.pop(0), dtype=np.float32)
                segments, _ = self.whisper_model.transcribe(audio, language="pl")

                # NOTE(review): presumably assistant_stop() can be called from
                # another thread while the calls above block - hence the re-check.
                if not self.assistant_running:
                    break

                text = self._normalize_transcript(
                    "".join(segment.text for segment in segments)
                )
                if not text:
                    continue

                speech_recog_text += text + "\n"

                print("=========================================")
                print(text)

                last_recog_time = time.perf_counter()

            elif (time.perf_counter() - last_recog_time > self.speech_recog_timeout
                    and len(speech_recog_text) > 0):
                # Silence lasted longer than the timeout: send what was collected.
                speech_recog_text = speech_recog_text.strip()

                print("=========================================\n\n")
                print("-----------------------------------------")
                print("!!!!!!!!!! SENDING TO CHATGPT !!!!!!!!!!!")
                print("-----------------------------------------")
                print(speech_recog_text)
                print("-----------------------------------------\n\n")

                gpt_response = self.gpt_wrap.get_response(speech_recog_text)

                print("-----------------------------------------")
                if self.use_chatgpt_placeholder:
                    print("!!!!! CHATGPT PLACEHOLDER RESPONSE !!!!!!")
                else:
                    print("!!!!!!!!!!! CHATGPT RESPONSE !!!!!!!!!!!!")
                print("-----------------------------------------")
                print(gpt_response)
                print("-----------------------------------------\n\n")

                if not self.assistant_running:
                    break

                speech_recog_text = ""

                # tts
                print("Speech synthesis stream started!")
                self.tts.tts_speak(self._strip_call_end_marker(gpt_response))

                if self._CALL_END_MARKER in gpt_response:
                    self.assistant_stop()

            # Short pause so the polling loop does not spin at full speed.
            time.sleep(0.01)

        # NOTE(review): the original trailing comment said assistant_running is
        # set back to True here "to indicate that the loop has exited", but no
        # code did so; the flag is left False on exit.

    def assistant_stop(self):
        """Request the main loop to exit and stop the VAD recorder."""
        print("Stopping assistant... ", end="", flush=True)
        self.assistant_running = False
        self.vad_rec.stop_vad_recorder()
        print("Done!")