import gradio as gr
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
import soundfile as sf
import tempfile
import spaces
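# Note: `spaces` is the Hugging Face Spaces helper package; its @spaces.GPU
# decorator (applied to process_input below) requests GPU time per call when
# the Space runs on ZeroGPU hardware. Harmless on a dedicated GPU or CPU Space.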
# Initialize the model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 on GPU; fall back to float32 on CPU (float16 inference on CPU is poorly supported)
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=torch_dtype,
    device_map="auto",
    enable_audio_output=True,
    # attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
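# The processor bundles the tokenizer with the image/audio/video feature
# extractors and provides apply_chat_template for building the model prompt.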
# System prompt
SYSTEM_PROMPT = {
    "role": "system",
    "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
}
# Voice options
VOICE_OPTIONS = {
    "Chelsie (Female)": "Chelsie",
    "Ethan (Male)": "Ethan"
}
@spaces.GPU
def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
    # Combine multimodal inputs
    user_input = {
        "text": text,
        "image": image,
        "audio": audio,
        "video": video
    }
    # Prepare conversation history for model processing
    conversation = [SYSTEM_PROMPT]
    # Add previous chat history
    if isinstance(chat_history, list):
        for item in chat_history:
            if isinstance(item, (list, tuple)) and len(item) == 2:
                user_msg, bot_msg = item
                if bot_msg is None:
                    # Pending entry added by the UI before the model replied; skip it
                    continue
                conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
                conversation.append({"role": "assistant", "content": bot_msg})
    else:
        # Initialize chat history if it's not a list
        chat_history = []
    # Add current user input
    conversation.append({"role": "user", "content": user_input_to_content(user_input)})
    # Prepare for inference (keep the original `text` argument intact for display)
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
    inputs = processor(
        text=prompt,
        audios=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True
    )
    inputs = inputs.to(model.device).to(model.dtype)
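    # With audio output enabled, model.generate returns both the response token
    # ids and a waveform tensor; otherwise only the token ids are returned.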
    # Generate response
    if enable_audio_output:
        voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
        text_ids, audio = model.generate(
            **inputs,
            use_audio_in_video=True,
            return_audio=True,
            spk=voice_type_value
        )
        # Save audio to a temporary WAV file (written at 24 kHz)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(
                tmp_file.name,
                audio.reshape(-1).detach().cpu().numpy(),
                samplerate=24000,
            )
            audio_path = tmp_file.name
    else:
        text_ids = model.generate(
            **inputs,
            use_audio_in_video=True,
            return_audio=False
        )
        audio_path = None
    # Decode text response
    text_response = processor.batch_decode(
        text_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    # Clean up text response
    text_response = text_response.strip()
    # Format user message for chat history display
    user_message_for_display = str(text) if text is not None else ""
    if image is not None:
        user_message_for_display = (user_message_for_display or "Image uploaded") + " [Image]"
    if audio is not None:
        user_message_for_display = (user_message_for_display or "Audio uploaded") + " [Audio]"
    if video is not None:
        user_message_for_display = (user_message_for_display or "Video uploaded") + " [Video]"
    # If empty, provide a default message
    if not user_message_for_display.strip():
        user_message_for_display = "Multimodal input"
    # Update chat history with properly formatted entries
    if not isinstance(chat_history, list):
        chat_history = []
    # Replace the pending (user, None) entry added by the UI, if present
    if chat_history and isinstance(chat_history[-1], (list, tuple)) and chat_history[-1][1] is None:
        chat_history = chat_history[:-1]
    chat_history.append((user_message_for_display, text_response))
    # Prepare output
    if enable_audio_output and audio_path:
        return chat_history, text_response, audio_path
    else:
        return chat_history, text_response, None
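# user_input_to_content converts one chat turn into the content format used by
# the Qwen chat template: plain strings pass through unchanged, while the dict
# built in process_input becomes a list of {"type": ..., ...} entries per modality.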
def user_input_to_content(user_input):
    if isinstance(user_input, str):
        return user_input
    elif isinstance(user_input, dict):
        # Handle file uploads
        content = []
        if "text" in user_input and user_input["text"]:
            content.append({"type": "text", "text": user_input["text"]})
        if "image" in user_input and user_input["image"]:
            content.append({"type": "image", "image": user_input["image"]})
        if "audio" in user_input and user_input["audio"]:
            content.append({"type": "audio", "audio": user_input["audio"]})
        if "video" in user_input and user_input["video"]:
            content.append({"type": "video", "video": user_input["video"]})
        return content
    return user_input
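# create_demo builds the Gradio Blocks UI: a chat column with separate tabs for
# text-only and multimodal input, plus a sidebar listing model capabilities,
# example prompts, and the speech/text outputs.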
def create_demo():
    with gr.Blocks(title="Qwen2.5-Omni Chat Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
        gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
        # Hidden placeholder components for text-only input
        placeholder_image = gr.Image(type="filepath", visible=False)
        placeholder_audio = gr.Audio(type="filepath", visible=False)
        placeholder_video = gr.Video(visible=False)
        # Chat interface
        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(height=600)
                with gr.Accordion("Advanced Options", open=False):
                    voice_type = gr.Dropdown(
                        choices=list(VOICE_OPTIONS.keys()),
                        value="Chelsie (Female)",
                        label="Voice Type"
                    )
                    enable_audio_output = gr.Checkbox(
                        value=True,
                        label="Enable Audio Output"
                    )
                # Multimodal input components
                with gr.Tabs():
                    with gr.TabItem("Text Input"):
                        text_input = gr.Textbox(
                            placeholder="Type your message here...",
                            label="Text Input"
                        )
                        text_submit = gr.Button("Send Text")
                    with gr.TabItem("Multimodal Input"):
                        with gr.Row():
                            image_input = gr.Image(
                                type="filepath",
                                label="Upload Image"
                            )
                            audio_input = gr.Audio(
                                type="filepath",
                                label="Upload Audio"
                            )
                        with gr.Row():
                            video_input = gr.Video(
                                label="Upload Video"
                            )
                            additional_text = gr.Textbox(
                                placeholder="Additional text message...",
                                label="Additional Text"
                            )
                        multimodal_submit = gr.Button("Send Multimodal Input")
                clear_button = gr.Button("Clear Chat")
            with gr.Column(scale=1):
                gr.Markdown("## Model Capabilities")
                gr.Markdown("""
                **Qwen2.5-Omni can:**
                - Process and understand text
                - Analyze images and answer questions about them
                - Transcribe and understand audio
                - Analyze video content (with or without audio)
                - Generate natural speech responses
                """)
                gr.Markdown("### Example Prompts")
                gr.Examples(
                    examples=[
                        ["Describe what you see in this image", "image"],
                        ["What is being said in this audio clip?", "audio"],
                        ["What's happening in this video?", "video"],
                        ["Explain quantum computing in simple terms", "text"],
                        ["Generate a short story about a robot learning to paint", "text"]
                    ],
                    inputs=[text_input, gr.Textbox(visible=False)],
                    label="Text Examples"
                )
                audio_output = gr.Audio(
                    label="Model Speech Output",
                    visible=True,
                    autoplay=True
                )
                text_output = gr.Textbox(
                    label="Model Text Response",
                    interactive=False
                )
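        # Event wiring: each submit button first echoes the user's message into
        # the chat window (as a pending entry with no reply), then runs
        # process_input to generate the model's response.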
        # Text input handling
        def echo_text_message(text, chat_history):
            # Show the user's message immediately; the reply is filled in by process_input
            if not isinstance(chat_history, list):
                chat_history = []
            return chat_history + [(str(text) if text else "Text message", None)]

        text_submit.click(
            fn=echo_text_message,
            inputs=[text_input, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        )
        # Multimodal input handling
        def prepare_multimodal_input(image, audio, video, text, chat_history):
            # Create a display message that indicates what was uploaded
            display_message = str(text) if text is not None else ""
            if image is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Image]"
            if audio is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Audio]"
            if video is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Video]"
            if not display_message.strip():
                display_message = "Multimodal content"
            # Show the user's message immediately; the reply is filled in by process_input
            if not isinstance(chat_history, list):
                chat_history = []
            return chat_history + [(display_message, None)]

        multimodal_submit.click(
            fn=prepare_multimodal_input,
            inputs=[image_input, audio_input, video_input, additional_text, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[image_input, audio_input, video_input, additional_text,
                    chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        )
        # Clear chat
        def clear_chat():
            return [], None, None

        clear_button.click(
            fn=clear_chat,
            outputs=[chatbot, text_output, audio_output]
        )
        # Update audio output visibility
        def toggle_audio_output(enable_audio):
            return gr.Audio(visible=enable_audio)

        enable_audio_output.change(
            fn=toggle_audio_output,
            inputs=enable_audio_output,
            outputs=audio_output
        )
    return demo
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)