Dave and I built an incredible-sounding multimodal voice system that reads text aloud.
import numpy as np
import uuid
from scipy.io.wavfile import write as write_wav
import openai
from bark import generate_audio, SAMPLE_RATE
# Initialize OpenAI API with your key
openai_api_key = "key"
openai.api_key = openai_api_key
def analyze_and_split_text(input_text):
    # Use GPT-3.5 Turbo to analyze the input text, infer emotions, and split it into meaningful segments
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[
            {'role': 'system', 'content': 'You are an expert in analyzing text and inferring emotions. Analyze the following text and split it into meaningful segments, providing the emotions for each segment. Return the segments in the following format: "Segment: [text] - Emotion: [emotion]". Consider the following special features: [laughter] for laughter, [sighs] for sighs, [music] for music, [gasps] for gasps, [clears throat] for clearing throat, ... for hesitations, ♪ for song lyrics, CAPITALIZATION for emphasis, [MAN] for male speakers, and [WOMAN] for female speakers.'},
            {'role': 'user', 'content': input_text}
        ],
    )
    # The model returns one "Segment: ... - Emotion: ..." entry per line
    return response['choices'][0]['message']['content'].split('\n')
def generate_audio_with_emotion(segment, emotion):
    # Map detected emotions to Bark's special tags
    emotion_to_tag = {
        'laughter': '[laughter]',
        'joy': '[laughs]',
        'sadness': '[sighs]',
        'music': '[music]',
        'surprise': '[gasps]',
        'hesitation': '...',
        'song': '♪',
        'male': '[MAN]',
        'female': '[WOMAN]'
    }
    # Bark conveys emphasis through capitalization rather than a bracketed tag,
    # so uppercase the segment itself instead of prepending a literal word
    if emotion.lower() == 'emphasis':
        segment = segment.upper()
    else:
        # Prepend the corresponding tag based on the detected emotion
        tag = emotion_to_tag.get(emotion.lower())
        if tag:
            segment = tag + ' ' + segment
    # Generate audio using Bark with the modified text
    audio_array = generate_audio(segment, history_prompt="v2/en_speaker_6")
    return audio_array
def generate_response(message):
    # Analyze the text for emotions and split it into meaningful segments
    segments_analysis = analyze_and_split_text(message)
    # Process each segment and generate audio
    pieces = []
    for segment_analysis in segments_analysis:
        try:
            segment, emotion = segment_analysis.split(' - Emotion: ')
            segment = segment.replace('Segment: ', '').strip()
            audio_array = generate_audio_with_emotion(segment, emotion.strip())
            # Three-quarters of a second of silence between segments;
            # float32 keeps the dtype consistent with Bark's output
            silence = np.zeros(int(0.75 * SAMPLE_RATE), dtype=np.float32)
            pieces += [audio_array, silence.copy()]
        except ValueError:
            print(f"Error processing segment: {segment_analysis}")
            continue
    # Guard against the case where every segment failed to parse
    if not pieces:
        print("No audio segments were generated.")
        return
    # Concatenate all audio pieces
    audio = np.concatenate(pieces)
    # Generate a random file name
    wav_file_name = str(uuid.uuid4()) + ".wav"
    # Save the audio to a WAV file in the current directory
    write_wav(wav_file_name, SAMPLE_RATE, audio)
    print(f"Audio file generated: {wav_file_name}")
# Test the function with a message
generate_response("As the sun dipped below the horizon, painting the sky with hues of orange and pink, a small robotic explorer rolled across the Martian landscape. Its sensors whirred and clicked, capturing data and images of the alien terrain. Far from its creators on Earth, it was a lone sentinel in a vast, red desert. Its mission was one of discovery and curiosity, a quest to unravel the mysteries of a distant world. Each rock analyzed, each soil sample taken, brought humanity one step closer to understanding our place in the cosmos. The robot's mechanical voice, programmed to narrate its findings, spoke with a sense of wonder and determination. 'Exploration is not just a journey,' it said, 'it's a symbol of our endless pursuit of knowledge and our unquenchable thirst for the unknown.' With those words, it continued its solitary trek under the Martian stars.")
Sample output: https://cdn.discordapp.com/attachments/1083129341870878780/1140803515824996392/dave.clemson.wav