from pathlib import Path
import openai
from pydub import AudioSegment
import speech_recognition as sr
import configparser
import threading
import time
import pyaudio
import wave
# Configuration
config = configparser.ConfigParser()
config.read('config.ini')
api_key = config['param']['apikey']
lang = config['param']['lang']
text_model = config['text']['model']
behavior = config['text']['behavior']
voice_model = config['voice']['model']
voice = config['voice']['voice']
image_model = config['image']['model']
image_size = config['image']['size']
image_quality = config['image']['quality']
dalle_command = config['commands']['img']
quit_command = config['commands']['quit']
# OpenAI API key
openai.api_key = api_key
# Speech recognizer
recognizer = sr.Recognizer()
# Context memory
conversation_history = []
# Extract the image description that follows the trigger command
def extract_image_description(text):
    prefix = dalle_command
    index = text.lower().find(prefix.lower())  # case-insensitive, matching the check in the main loop
    if index != -1:
        description = text[index + len(prefix):]
        return description.strip()
    else:
        return None
# Image generation using the OpenAI DALL-E 3 model
def dalle(desc):
    response = openai.images.generate(
        model=image_model,
        prompt=desc,
        size=image_size,        # e.g. 1792x1024, read from config.ini
        quality=image_quality,  # "standard" or "hd", read from config.ini
        n=1,
    )
    image_url = response.data[0].url
    print(image_url)
# Calibrate the energy threshold against ambient noise
def optimize_energy_threshold(recognizer, source):
    print("Optimization of the energy threshold in progress...")
    recognizer.adjust_for_ambient_noise(source, duration=3)  # sample the noise for 3 seconds
    recognizer.energy_threshold = max(recognizer.energy_threshold, 400)  # keep a minimum floor without discarding the calibration
    print("Energy threshold optimized.")
# Play an MP3 file with the ability to interrupt playback
def play_audio_with_interruption(file_path, stop_event):
    # Convert to WAV format so pyaudio can stream it
    audio = AudioSegment.from_file(file_path, format="mp3")
    wav_file_path = "speech.wav"
    audio.export(wav_file_path, format="wav")
    # Set up pyaudio
    p = pyaudio.PyAudio()
    wf = wave.open(wav_file_path, 'rb')

    def callback(in_data, frame_count, time_info, status):
        data = wf.readframes(frame_count)
        if stop_event.is_set():
            return (data, pyaudio.paAbort)
        return (data, pyaudio.paContinue)

    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True,
                    stream_callback=callback)
    stream.start_stream()
    while stream.is_active():
        if stop_event.is_set():
            stream.stop_stream()
            break
        time.sleep(0.1)  # avoid a busy loop
    stream.close()
    wf.close()
    # Do not terminate pyaudio here so the microphone stays usable
# Use the microphone as the audio source
with sr.Microphone() as source:
    optimize_energy_threshold(recognizer, source)
    stop_event = threading.Event()
    while True:  # main loop
        print("Say something...")
        audio = recognizer.listen(source)
        # Speech recognition
        try:
            text = recognizer.recognize_google(audio, language=lang)
            print("You said: " + text)
            # DALL-E interaction
            if dalle_command in text.lower():
                desc = extract_image_description(text)
                dalle(desc)
            # Add the user message to the history (keep only the last 5 messages)
            conversation_history.append({"role": "user", "content": text})
            if len(conversation_history) > 5:
                conversation_history.pop(0)
            # Chat model interaction (model set in config.ini)
            response = openai.chat.completions.create(
                model=text_model,
                messages=[{"role": "system", "content": behavior}] + conversation_history,
            )
            speech = response.choices[0].message.content
            print(speech)
            # Voice synthesis of the model's reply
            with openai.audio.speech.with_streaming_response.create(
                model=voice_model,
                voice=voice,
                input=speech,
            ) as response:
                response.stream_to_file("speech.mp3")
            speech_file_path = "speech.mp3"
            # Play the reply on a background thread so it can be interrupted
            stop_event.clear()
            audio_thread = threading.Thread(target=play_audio_with_interruption, args=(speech_file_path, stop_event))
            audio_thread.start()
            # Wait for a user interruption while the audio plays
            while audio_thread.is_alive():
                try:
                    interrupt_audio = recognizer.listen(source, timeout=1)
                    interrupt_text = recognizer.recognize_google(interrupt_audio, language=lang)
                    if interrupt_text:
                        print("Interruption detected: " + interrupt_text)
                        stop_event.set()
                        audio_thread.join()
                        break
                except sr.WaitTimeoutError:
                    continue
                except sr.UnknownValueError:
                    continue
            # Add the assistant message to the history (keep only the last 5 messages)
            conversation_history.append({"role": "assistant", "content": speech})
            if len(conversation_history) > 5:
                conversation_history.pop(0)
            # End the conversation when the model says the quit word
            if quit_command in speech.lower():
                print("Alice ended the conversation")
                break
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand the audio.")
        except sr.RequestError as e:
            print("Cannot reach the Google Speech Recognition service; {0}".format(e))
        except openai.OpenAIError as e:
            print("OpenAI API error: {0}".format(e))
        except Exception as ex:
            print("An error occurred:", ex)
Installation
Here is how to install the required dependencies:
pip install openai pydub SpeechRecognition pyaudio
If installing pyaudio causes problems on Windows, use pipwin:
pip install pipwin
pipwin install pyaudio
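To check that pyaudio was installed correctly and can see your microphone, here is a quick sketch using nothing beyond the SpeechRecognition API already used in the script:
import speech_recognition as sr

# List the audio input devices visible to pyaudio;
# an empty list means the audio stack is not working.
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print(f"{index}: {name}")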
OpenAI API Configuration
Before you can use the OpenAI API, you need an API key:
- Create an account on OpenAI.
- Retrieve your API key from the "API Keys" section.
- Create a config.ini file in the project directory and add your API key to it:
[param]
apikey = API_KEY_HERE
lang = fr-FR
[text]
model = chatgpt-4o-latest
behavior = You are Alice. You give short to medium-length answers. You like to bring relevant information and suggestions. You can speak any language. To end a conversation, you will say the word: kiss
[voice]
model = tts-1
voice = nova
#alloy, echo, fable, onyx, nova, shimmer
[image]
model = dall-e-3
size = 1792x1024
quality = hd
[commands]
img = fais une image
quit = kiss
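To confirm the key and the configured model work before launching the full assistant, here is a minimal sanity check that reuses the same calls as the main script (the one-word prompt is just an example):
import configparser
import openai

# Load the same config.ini and send a one-line chat request.
config = configparser.ConfigParser()
config.read('config.ini')
openai.api_key = config['param']['apikey']

response = openai.chat.completions.create(
    model=config['text']['model'],
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(response.choices[0].message.content)  # a printed reply means the key and model are valid
Once this prints a reply, run the assistant script itself and start talking.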