from pathlib import Path
import openai
from pydub import AudioSegment
import speech_recognition as sr
import configparser
import threading
import time
import pyaudio
import wave
# Configuration
config = configparser.ConfigParser()
config.read('config.ini')
api_key = config['param']['apikey']
lang = config['param']['lang']
text_model = config['text']['model']
behavior = config['text']['behavior']
voice_model = config['voice']['model']
voice = config['voice']['voice']
image_model = config['image']['model']
image_size = config['image']['size']
image_quality = config['image']['quality']
dalle_command = config['commands']['img']
quit_command = config['commands']['quit']
# OpenAI API key
openai.api_key = api_key
# Speech recognizer
recognizer = sr.Recognizer()
# Context memory
conversation_history = []
# Extract the image description that follows the trigger command
def extract_image_description(text):
    prefix = dalle_command
    index = text.lower().find(prefix.lower())  # case-insensitive, matching the check in the main loop
    if index != -1:
        description = text[index + len(prefix):]
        return description.strip()
    else:
        return None
# Image generation using the OpenAI DALL-E 3 model
def dalle(desc):
    response = openai.images.generate(
        model=image_model,
        prompt=desc,
        size=image_size,        # e.g. 1792x1024, read from config.ini
        quality=image_quality,  # "standard" or "hd", read from config.ini
        n=1,
    )
    image_url = response.data[0].url
    print(image_url)
# Calibrate the energy threshold against ambient noise
def optimize_energy_threshold(recognizer, source):
    print("Optimization of the energy threshold in progress...")
    recognizer.adjust_for_ambient_noise(source, duration=3)  # sample the noise for 3 seconds
    recognizer.energy_threshold = max(recognizer.energy_threshold, 400)  # keep a minimum floor without discarding the calibration
    print("Energy threshold optimized.")
# Play an MP3 file with the ability to interrupt playback
def play_audio_with_interruption(file_path, stop_event):
    # Convert to WAV format so pyaudio can stream it
    audio = AudioSegment.from_file(file_path, format="mp3")
    wav_file_path = "speech.wav"
    audio.export(wav_file_path, format="wav")
    # Set up pyaudio
    p = pyaudio.PyAudio()
    wf = wave.open(wav_file_path, 'rb')

    def callback(in_data, frame_count, time_info, status):
        data = wf.readframes(frame_count)
        if stop_event.is_set():
            return (data, pyaudio.paAbort)
        return (data, pyaudio.paContinue)

    stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True,
                    stream_callback=callback)
    stream.start_stream()
    while stream.is_active():
        if stop_event.is_set():
            stream.stop_stream()
            break
        time.sleep(0.1)  # avoid a busy loop
    stream.close()
    wf.close()
    # Do not terminate pyaudio here so the microphone stays usable
# Use the microphone as the audio source
with sr.Microphone() as source:
    optimize_energy_threshold(recognizer, source)
    stop_event = threading.Event()
    while True:  # main loop
        print("Say something...")
        audio = recognizer.listen(source)
        # Speech recognition
        try:
            text = recognizer.recognize_google(audio, language=lang)
            print("You said: " + text)
            # DALL-E interaction
            if dalle_command in text.lower():
                desc = extract_image_description(text)
                dalle(desc)
            # Add the user message to the history (keep only the last 5 messages)
            conversation_history.append({"role": "user", "content": text})
            if len(conversation_history) > 5:
                conversation_history.pop(0)
            # Chat model interaction (model set in config.ini)
            response = openai.chat.completions.create(
                model=text_model,
                messages=[{"role": "system", "content": behavior}] + conversation_history,
            )
            speech = response.choices[0].message.content
            print(speech)
            # Voice synthesis of the model's reply
            with openai.audio.speech.with_streaming_response.create(
                model=voice_model,
                voice=voice,
                input=speech,
            ) as response:
                response.stream_to_file("speech.mp3")
            speech_file_path = "speech.mp3"
            # Play the reply on a background thread so it can be interrupted
            stop_event.clear()
            audio_thread = threading.Thread(target=play_audio_with_interruption, args=(speech_file_path, stop_event))
            audio_thread.start()
            # Wait for a user interruption while the audio plays
            while audio_thread.is_alive():
                try:
                    interrupt_audio = recognizer.listen(source, timeout=1)
                    interrupt_text = recognizer.recognize_google(interrupt_audio, language=lang)
                    if interrupt_text:
                        print("Interruption detected: " + interrupt_text)
                        stop_event.set()
                        audio_thread.join()
                        break
                except sr.WaitTimeoutError:
                    continue
                except sr.UnknownValueError:
                    continue
            # Add the assistant message to the history (keep only the last 5 messages)
            conversation_history.append({"role": "assistant", "content": speech})
            if len(conversation_history) > 5:
                conversation_history.pop(0)
            # End the conversation when the model says the quit word
            if quit_command in speech.lower():
                print("Alice ended the conversation")
                break
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand the audio.")
        except sr.RequestError as e:
            print("Cannot reach the Google Speech Recognition service; {0}".format(e))
        except openai.OpenAIError as e:
            print("OpenAI API error: {0}".format(e))
        except Exception as ex:
            print("An error occurred:", ex)
Installation
Here is how to install the required dependencies:
pip install openai pydub SpeechRecognition pyaudio
If installing pyaudio causes problems on Windows, use pipwin:
pip install pipwin
pipwin install pyaudio
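To check that pyaudio was installed correctly and can see your microphone, here is a quick sketch using nothing beyond the SpeechRecognition API already used in the script:
import speech_recognition as sr

# List the audio input devices visible to pyaudio;
# an empty list means the audio stack is not working.
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print(f"{index}: {name}")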
OpenAI API Configuration
Before you can use the OpenAI API, you need an API key:
- Create an account on OpenAI.
- Retrieve your API key from the "API Keys" section.
- Create a config.ini file in the project directory and add your API key to it:
[param]
apikey = API_KEY_HERE
lang = fr-FR
[text]
model = chatgpt-4o-latest
behavior = You are Alice. You give short to medium-length answers. You like to bring relevant information and suggestions. You can speak any language. To end a conversation, you will say the word: kiss
[voice]
model = tts-1
voice = nova
#alloy, echo, fable, onyx, nova, shimmer
[image]
model = dall-e-3
size = 1792x1024
quality = hd
[commands]
img = fais une image
quit = kiss
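To confirm the key and the configured model work before launching the full assistant, here is a minimal sanity check that reuses the same calls as the main script (the one-word prompt is just an example):
import configparser
import openai

# Load the same config.ini and send a one-line chat request.
config = configparser.ConfigParser()
config.read('config.ini')
openai.api_key = config['param']['apikey']

response = openai.chat.completions.create(
    model=config['text']['model'],
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(response.choices[0].message.content)  # a printed reply means the key and model are valid
Once this prints a reply, run the assistant script itself and start talking.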