import random
import re


class ChatGPTWrap:
    """Conversation backend for the phone-ordering assistant.

    Two modes are supported:

    * placeholder mode (``use_chatgpt_placeholder=True``): no network access,
      replies are canned strings built from the order data;
    * real mode: loads the system-instruction template from
      ``system_instructions.txt`` (resolved against the current working
      directory) and is otherwise still a stub that raises
      ``NotImplementedError``.

    A reply ending in ``" CALLEND"`` marks the end of the call.
    """

    # Matches both ``{{name}}`` (as used by the shipped template) and the
    # ``str.format``-style ``{name}``, so either template flavour renders.
    _PLACEHOLDER_PATTERN = re.compile(r"\{\{\s*(\w+)\s*\}\}|\{(\w+)\}")

    # In placeholder mode the reply with this ordinal carries the CALLEND marker.
    _PLACEHOLDER_FINAL_MESSAGE = 3

    def __init__(self, use_chatgpt_placeholder=False):
        """Set up the backend.

        Args:
            use_chatgpt_placeholder: when true, use canned replies instead of
                the real ChatGPT backend.

        Raises:
            NotImplementedError: always in real mode, because the real backend
                is not implemented yet.
            OSError: in real mode, if ``system_instructions.txt`` cannot be read.
        """
        self.use_chatgpt_placeholder = use_chatgpt_placeholder

        if use_chatgpt_placeholder:
            print("Using ChatGPT placeholder!")
            self.message_idx = 0
            return

        print("Initializing ChatGPT... ", end="")

        with open("system_instructions.txt", "r", encoding="utf-8") as f:
            self.system_inst_template = f.read()

        # TODO: one-time OpenAI client initialisation goes here; print "Done!"
        # once it has succeeded.
        raise NotImplementedError("True ChatGPT is not implemented yet!")

    def init_order(self, phone_number, order_items, delivery_address, payment_method):
        """Store the order data and prepare a fresh assistant session.

        In placeholder mode the reply counter is reset, so every session (not
        only the first one) ends with the CALLEND marker on its third reply.
        In real mode the system instructions are rendered from the template
        into ``self.system_inst``.

        Args:
            phone_number: customer phone number.
            order_items: description of the ordered items.
            delivery_address: where the order should be delivered.
            payment_method: how the order will be paid for.
        """
        self.phone_number = phone_number
        self.order_items = order_items
        self.delivery_address = delivery_address
        self.payment_method = payment_method

        if self.use_chatgpt_placeholder:
            self.message_idx = 0
            return

        self.system_inst = self._render_system_instructions()

        # TODO: per-session OpenAI setup (e.g. conversation cleanup) goes here.

    def _render_system_instructions(self):
        """Return the template with the order placeholders filled in."""
        values = {
            "phone_number": self.phone_number,
            "order_items": self.order_items,
            "delivery_address": self.delivery_address,
            # The shipped template calls the address "delivery_location".
            "delivery_location": self.delivery_address,
            "payment_method": self.payment_method,
        }

        def substitute(match):
            name = match.group(1) or match.group(2)
            if name in values:
                return str(values[name])
            # Unknown placeholder: leave the text untouched instead of failing.
            return match.group(0)

        # A single pass, so braces inside the substituted values are never
        # re-interpreted as placeholders.
        return self._PLACEHOLDER_PATTERN.sub(substitute, self.system_inst_template)

    def get_response(self, input_message):
        """Return the assistant's reply to ``input_message``.

        Args:
            input_message: the recognised utterance to respond to (ignored in
                placeholder mode).

        Returns:
            The reply text. In placeholder mode the third reply of a session
            ends with ``" CALLEND"``.

        Raises:
            NotImplementedError: in real mode, which is not implemented yet
                (previously this silently returned ``None``).
        """
        if not self.use_chatgpt_placeholder:
            # TODO: real OpenAI request goes here.
            raise NotImplementedError("True ChatGPT is not implemented yet!")

        choices = (
            self.phone_number,
            self.order_items,
            self.delivery_address,
            self.payment_method,
        )

        self.message_idx += 1
        call_end = " CALLEND" if self.message_idx == self._PLACEHOLDER_FINAL_MESSAGE else ""

        return f"czat dżi pi ti plejsholder {random.choice(choices)}{call_end}"
self.delivery_address, + self.payment_method + ) @@ -116,9 +126,19 @@ class Core: print(speech_recog_text) print("-----------------------------------------\n\n") - time.sleep(1) # fake chatgpt delay + + + gpt_response = self.gpt_wrap.get_response(speech_recog_text) + + print("-----------------------------------------") + if self.use_chatgpt_placeholder: + print("!!!!! CHATGPT PLACEHOLDER RESPONSE !!!!!!") + else: + print("!!!!!!!!!!! CHATGPT RESPONSE !!!!!!!!!!!!") + print("-----------------------------------------") + print(gpt_response) + print("-----------------------------------------\n\n") - gpt_response = "czat dżi pi ti plejsholder 1 2 3" if not self.assistant_running: break @@ -127,7 +147,11 @@ class Core: # tts print("Speech synthesis stream started!") - self.tts.tts_speak(gpt_response) + self.tts.tts_speak(gpt_response.replace(" CALLEND", "")) + + + if "CALLEND" in gpt_response: + self.assistant_stop() #print(len(audios_for_whisper), time.perf_counter() - last_recog_time, len(speech_recog_text)) diff --git a/frontend/core_test.ipynb b/frontend/core_test.ipynb index 42e3c5f..51e6931 100644 --- a/frontend/core_test.ipynb +++ b/frontend/core_test.ipynb @@ -8,12 +8,16 @@ "source": [ "from core import Core\n", "\n", - "core = Core(whisper_model_name = \"large-v3\")\n", + "core = Core(\n", + " whisper_model_name = \"large-v3\",\n", + " use_chatgpt_placeholder = True\n", + ")\n", "\n", "core.set_order_settings(\n", - " phone_number = 123456789,\n", + " phone_number = \"123456789\",\n", " order_items = \"1x margharitta\\n2x sos majonezowy\",\n", - " delivery_address = \"ul. Amogusowa 1337, Suski Małe\"\n", + " delivery_address = \"ul. 
Amogusowa 1337, Suski Małe\",\n", + " payment_method = \"karta\"\n", ")\n", "\n", "core.set_speech_recog_settings(\n", diff --git a/frontend/system_instructions.txt b/frontend/system_instructions.txt new file mode 100644 index 0000000..93e01e4 --- /dev/null +++ b/frontend/system_instructions.txt @@ -0,0 +1,15 @@ +You are a bot that will act as a guy that orders a pizza you will be connected on the phone with a pizza place. you will be asked questions about example: Where should be the pizza delivered etc. and you will respond with the data in the data section also you HAVE to respond in a full sentence because it will be transformed into audio using a tts software so you cant use a list just make a sentence like: i would like a margharitta and a cocacola please. basically just write it like you would say it dont put numbers but put words that are numbers dont add shortcuts add the full word + +DATA: +phone: {phone_number} +delivery location: {delivery_address} +paymentMethod: {payment_method} +{order_items} + +REMEMBER DONT USE NUMBERS, USE WORDS for example dont say 1x, say one time also REMEMBER to use replacement words to a word so it is appropriate to the whole sentence example: +WRONG: +Chciałbym zamówić jedną Margherittę, dwie Colę, pięć Fant i jedną Sprite. +RIGHT: +Chciałbym zamówić jedną Margarittę, dwie Kole, pięć Fant i jednego Sprajta. + +If the call ends, say at the end of your final response "CALLEND" \ No newline at end of file diff --git a/frontend/tts_stream.py b/frontend/tts_stream.py index 65e11ea..1dc78ef 100644 --- a/frontend/tts_stream.py +++ b/frontend/tts_stream.py @@ -78,6 +78,9 @@ class TTSStream: def tts_speak(self, text): self.play_buffer_size = 512 + # separate long sequences of numbers in text string (for example 123456789) into packets of 3 (123 456 789) + text = re.sub(r"(\d{3})(?=\d)", r"\1 ", text) + # open pyaudio stream p = pyaudio.PyAudio()