# Spaces: Running  (page-header residue from the hosting UI; not part of the program)
import base64 | |
import tempfile | |
import os | |
import requests | |
import gradio as gr | |
import random | |
from openai import OpenAI | |
# Voice names offered in the UI dropdowns and passed to the audio model.
VOICES = [
    "alloy",
    "ash",
    "ballad",
    "coral",
    "echo",
    "fable",
    "onyx",
    "nova",
    "sage",
    "shimmer",
    "verse",
]

# Sample clips used by the "random example audio" feature, one URL per
# listed voice name (order matters only for readability; a random one is picked).
EXAMPLE_AUDIO_URLS = [
    f"https://cdn.openai.com/API/docs/audio/{sample_voice_name}.wav"
    for sample_voice_name in (
        "alloy",
        "ash",
        "coral",
        "echo",
        "fable",
        "onyx",
        "nova",
        "sage",
        "shimmer",
    )
]
def process_text_input(api_key, text_prompt, selected_voice):
    """Generate a text + audio reply to a text prompt.

    Sends ``text_prompt`` as a single user message to ``gpt-4o-audio-preview``
    with both text and audio modalities requested, then writes the returned
    WAV audio to a temporary file.

    Args:
        api_key: OpenAI API key used to construct the client.
        text_prompt: The user's message.
        selected_voice: Voice name forwarded in the ``audio`` request option.

    Returns:
        A ``(text, audio_path)`` tuple. On any failure ``audio_path`` is
        ``None`` and ``text`` carries a human-readable message; exceptions are
        never propagated so the UI handler can display the message directly.
    """
    # Same guard (and wording) as the voice-sample handler: an empty key can
    # only produce an opaque authentication failure further down.
    if not api_key:
        return "Please enter your OpenAI API key first.", None

    try:
        client = OpenAI(api_key=api_key)
        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": text_prompt,
                }
            ],
        )
        message = completion.choices[0].message
        wav_bytes = base64.b64decode(message.audio.data)

        # tempfile.mktemp() only invents a name (deprecated, race-prone).
        # NamedTemporaryFile creates the file itself; delete=False keeps it on
        # disk so the caller can hand the path to the audio player.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # When audio output is requested the reply text is normally carried in
        # the audio object's transcript and ``content`` is empty, so fall back
        # to the transcript instead of returning None.
        text_response = message.content or getattr(message.audio, "transcript", None)
        return text_response, temp_path
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the handler.
        return f"Error: {str(e)}", None
def process_audio_input(api_key, audio_path, text_prompt, selected_voice):
    """Send an audio clip (plus optional text) to the model and return its reply.

    The clip at ``audio_path`` is base64-encoded and sent as an ``input_audio``
    content part to ``gpt-4o-audio-preview``; ``text_prompt``, when non-empty,
    is sent as a preceding ``text`` part. The returned WAV audio is written to
    a temporary file.

    Args:
        api_key: OpenAI API key used to construct the client.
        audio_path: Filesystem path of the input clip; falsy means "no audio".
        text_prompt: Optional accompanying text.
        selected_voice: Voice name forwarded in the ``audio`` request option.

    Returns:
        A ``(text, audio_path)`` tuple. On any failure the second element is
        ``None`` and the first carries a human-readable message; exceptions
        are never propagated.
    """
    if not audio_path:
        return "Please upload or record audio first.", None
    # Same guard (and wording) as the voice-sample handler.
    if not api_key:
        return "Please enter your OpenAI API key first.", None

    try:
        client = OpenAI(api_key=api_key)

        with open(audio_path, "rb") as audio_file:
            encoded_audio = base64.b64encode(audio_file.read()).decode("utf-8")

        # The request must declare the clip's container format. Uploaded files
        # are not necessarily WAV, so label .mp3 files as such instead of
        # always claiming "wav"; anything else keeps the previous default.
        extension = os.path.splitext(str(audio_path))[1].lower()
        input_format = "mp3" if extension == ".mp3" else "wav"

        # Text part (if any) goes first, followed by the audio part.
        message_content = []
        if text_prompt:
            message_content.append({
                "type": "text",
                "text": text_prompt,
            })
        message_content.append({
            "type": "input_audio",
            "input_audio": {
                "data": encoded_audio,
                "format": input_format,
            },
        })

        completion = client.chat.completions.create(
            model="gpt-4o-audio-preview",
            modalities=["text", "audio"],
            audio={"voice": selected_voice, "format": "wav"},
            messages=[
                {
                    "role": "user",
                    "content": message_content,
                }
            ],
        )
        message = completion.choices[0].message
        wav_bytes = base64.b64decode(message.audio.data)

        # tempfile.mktemp() only invents a name (deprecated, race-prone).
        # NamedTemporaryFile creates the file itself; delete=False keeps it on
        # disk so the caller can hand the path to the audio player.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(wav_bytes)
            temp_path = f.name

        # With audio output the reply text is normally in the audio transcript
        # and ``content`` is empty, so fall back to the transcript.
        text_response = message.content or getattr(message.audio, "transcript", None)
        return text_response, temp_path
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the handler.
        return f"Error: {str(e)}", None
def transcribe_audio(api_key, audio_path):
    """Return the text transcription of the audio file at ``audio_path``.

    Uses the ``gpt-4o-transcribe`` model. A falsy ``audio_path`` yields a
    fixed notice; any exception is converted into a
    ``"Transcription error: ..."`` string instead of being raised.
    """
    try:
        if audio_path:
            openai_client = OpenAI(api_key=api_key)
            with open(audio_path, "rb") as source:
                result = openai_client.audio.transcriptions.create(
                    file=source,
                    model="gpt-4o-transcribe",
                )
            return result.text
        return "No audio file provided for transcription."
    except Exception as exc:
        return f"Transcription error: {exc}"
def download_example_audio():
    """Download one randomly chosen example clip to a temporary WAV file.

    Returns:
        ``(path, message)``. On success ``path`` is the temporary file and
        ``message`` names the voice taken from the URL's file name. On any
        failure ``path`` is ``None`` and ``message`` describes the error;
        exceptions are never propagated.
    """
    try:
        url = random.choice(EXAMPLE_AUDIO_URLS)
        # ".../alloy.wav" -> "alloy"; used only for the status message.
        voice_name = url.split('/')[-1].split('.')[0]

        # requests has no default timeout; without one a stalled connection
        # would block this UI handler indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # tempfile.mktemp() only invents a name (deprecated, race-prone).
        # NamedTemporaryFile creates the file itself; delete=False keeps it on
        # disk so the path can be handed to the audio component.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(response.content)
            temp_path = f.name

        return temp_path, f"Loaded example voice: {voice_name}"
    except Exception as e:
        # UI boundary: surface the failure as a status message.
        return None, f"Error loading example: {str(e)}"
def use_example_audio():
    """Button handler: fetch a random example clip for the audio input.

    Returns the ``(audio_path, message)`` pair produced by
    ``download_example_audio`` unchanged.
    """
    return download_example_audio()
# Create Gradio Interface
with gr.Blocks(title="OpenAI Audio Chat App") as app:
    gr.Markdown("# OpenAI Audio Chat App")
    gr.Markdown("Interact with GPT-4o audio model through text and audio inputs")

    # API Key input (used across all tabs)
    api_key = gr.Textbox(
        label="OpenAI API Key",
        placeholder="Enter your OpenAI API key here",
        type="password"
    )

    with gr.Tab("Text to Audio"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="Text Prompt",
                    placeholder="Enter your question or prompt here...",
                    lines=3
                )
                text_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Voice"
                )
                text_submit = gr.Button("Generate Response")

            with gr.Column():
                text_output = gr.Textbox(label="AI Response (Checks Error)", lines=5)
                audio_output = gr.Audio(label="AI Response (Audio)")
                transcribed_output = gr.Textbox(label="Transcription of Audio Response", lines=3)

        def text_input_with_transcription(api_key, text_prompt, voice):
            """Generate the reply for a text prompt, then transcribe the reply audio.

            Returns ``(text_response, audio_path, transcription)`` matching the
            three output components wired below.
            """
            text_response, audio_path = process_text_input(api_key, text_prompt, voice)

            # Only transcribe when audio was actually produced.
            if audio_path:
                transcription = transcribe_audio(api_key, audio_path)
            else:
                transcription = "No audio generated to transcribe."

            return text_response, audio_path, transcription

        text_submit.click(
            fn=text_input_with_transcription,
            inputs=[api_key, text_input, text_voice],
            outputs=[text_output, audio_output, transcribed_output]
        )

    with gr.Tab("Audio Input to Audio Response"):
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Audio Input",
                    type="filepath",
                    sources=["microphone", "upload"]
                )
                example_btn = gr.Button("Use Random Example Audio")
                example_message = gr.Textbox(label="Example Status", interactive=False)

                accompanying_text = gr.Textbox(
                    label="Accompanying Text (Optional)",
                    placeholder="Add any text context or question about the audio...",
                    lines=2
                )
                audio_voice = gr.Dropdown(
                    choices=VOICES,
                    value="alloy",
                    label="Response Voice"
                )
                audio_submit = gr.Button("Process Audio & Generate Response")

            with gr.Column():
                audio_text_output = gr.Textbox(label="AI Response (Checks Error)", lines=5)
                audio_audio_output = gr.Audio(label="AI Response (Audio)")
                audio_transcribed_output = gr.Textbox(label="Transcription of Audio Response", lines=3)
                input_transcription = gr.Textbox(label="Transcription of Input Audio", lines=3)

        def audio_input_with_transcription(api_key, audio_path, text_prompt, voice):
            """Transcribe the input clip, get the model's reply, and transcribe the reply.

            Returns ``(text_response, response_audio_path, response_transcription,
            input_text)`` matching the four output components wired below.
            """
            # Local name deliberately differs from the ``input_transcription``
            # component above so the handler does not shadow it.
            input_text = "N/A"
            if audio_path:
                input_text = transcribe_audio(api_key, audio_path)

            text_response, response_audio_path = process_audio_input(
                api_key, audio_path, text_prompt, voice
            )

            response_transcription = "No audio generated to transcribe."
            if response_audio_path:
                response_transcription = transcribe_audio(api_key, response_audio_path)

            return text_response, response_audio_path, response_transcription, input_text

        audio_submit.click(
            fn=audio_input_with_transcription,
            inputs=[api_key, audio_input, accompanying_text, audio_voice],
            outputs=[audio_text_output, audio_audio_output, audio_transcribed_output, input_transcription]
        )

        example_btn.click(
            fn=use_example_audio,
            inputs=[],
            outputs=[audio_input, example_message]
        )

    with gr.Tab("Voice Samples"):
        gr.Markdown("## Listen to samples of each voice")

        def generate_voice_sample(api_key, voice_type):
            """Generate a short spoken sample in ``voice_type`` and transcribe it.

            Returns ``(status, audio_path, transcription)``. On a missing key or
            any exception, ``audio_path`` is ``None`` and ``status`` explains why.
            """
            try:
                if not api_key:
                    return "Please enter your OpenAI API key first.", None, "No transcription available."

                client = OpenAI(api_key=api_key)
                completion = client.chat.completions.create(
                    model="gpt-4o-audio-preview",
                    modalities=["text", "audio"],
                    audio={"voice": voice_type, "format": "wav"},
                    messages=[
                        {
                            "role": "user",
                            "content": f"This is a sample of the {voice_type} voice. It has its own unique tone and character."
                        }
                    ]
                )

                wav_bytes = base64.b64decode(completion.choices[0].message.audio.data)

                # tempfile.mktemp() only invents a name (deprecated, race-prone).
                # NamedTemporaryFile creates the file itself; delete=False keeps
                # it on disk for the audio component and the transcription call.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    f.write(wav_bytes)
                    temp_path = f.name

                transcription = transcribe_audio(api_key, temp_path)

                return f"Sample generated with voice: {voice_type}", temp_path, transcription
            except Exception as e:
                # UI boundary: report the failure in the status box.
                return f"Error: {str(e)}", None, "No transcription available."

        with gr.Row():
            sample_voice = gr.Dropdown(
                choices=VOICES,
                value="alloy",
                label="Select Voice Sample"
            )
            sample_btn = gr.Button("Generate Sample")

        with gr.Row():
            sample_text = gr.Textbox(label="Status")
            sample_audio = gr.Audio(label="Voice Sample")
            sample_transcription = gr.Textbox(label="Transcription", lines=3)

        sample_btn.click(
            fn=generate_voice_sample,
            inputs=[api_key, sample_voice],
            outputs=[sample_text, sample_audio, sample_transcription]
        )

    gr.Markdown("""
    ## Notes:
    - You must provide your OpenAI API key in the field above
    - The model used is `gpt-4o-audio-preview` for conversation and `gpt-4o-transcribe` for transcriptions
    - Audio inputs should be in WAV format
    - Available voices: alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse
    - Each audio response is automatically transcribed for verification
    - The "Use Random Example Audio" button will load a random sample from OpenAI's demo voices
    """)

# Launch the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    app.launch()