Spaces:

aimeri
/

Qwen2.5-Omni-7B-Demo

Build error

File size: 19,785 Bytes

import gradio as gr
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor, TextStreamer
from qwen_omni_utils import process_mm_info
import soundfile as sf
import tempfile
import spaces
import gc

# Initialize the model and processor
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 on GPU. On CPU use float32: half-precision CPU kernels are
# incomplete and slow, so float16 is not a safe fallback there.
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

def get_model():
    """Load the Qwen2.5-Omni-7B checkpoint with speech output enabled.

    Frees as much CUDA memory as possible first, then loads the weights with
    the module-level ``torch_dtype`` and lets ``device_map="auto"`` decide
    placement. FlashAttention 2 is requested only when CUDA is available.

    Returns:
        The object returned by ``Qwen2_5OmniModel.from_pretrained``.
    """
    if torch.cuda.is_available():
        # Collect garbage first: empty_cache() can only hand back blocks that
        # no live tensor still occupies, so unreachable tensors must be
        # destroyed before the cache is flushed.
        gc.collect()
        torch.cuda.empty_cache()

    model = Qwen2_5OmniModel.from_pretrained(
        "Qwen/Qwen2.5-Omni-7B",
        torch_dtype=torch_dtype,
        device_map="auto",
        enable_audio_output=True,
        low_cpu_mem_usage=True,
        attn_implementation="flash_attention_2" if torch.cuda.is_available() else None,
    )
    return model

# Load the model and its processor once, at import time; process_input reads
# these module-level instances on every request.
model = get_model()
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# System prompt
# Leading system turn that process_input places at the start of every
# conversation it sends to the model.
SYSTEM_PROMPT = dict(
    role="system",
    content=(
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
        "capable of perceiving auditory and visual inputs, "
        "as well as generating text and speech."
    ),
)

# Voice options
# Maps the label shown in the UI dropdown to the speaker name passed to the model.
VOICE_OPTIONS = {
    f"{speaker} ({gender})": speaker
    for speaker, gender in (("Chelsie", "Female"), ("Ethan", "Male"))
}

# Labels the UI appends to a displayed user message to indicate attachments.
_ATTACHMENT_MARKERS = ("[Image]", "[Audio]", "[Video]")


def _release_gpu_memory():
    """Destroy unreachable objects, then return cached CUDA blocks to the driver."""
    if torch.cuda.is_available():
        # Collect first: empty_cache() cannot release blocks that are still
        # occupied by tensors awaiting garbage collection.
        gc.collect()
        torch.cuda.empty_cache()


def _describe_user_turn(text, image, audio, video):
    """Build the chat-window label for the current user turn."""
    text = str(text) if text is not None else ""
    parts = [text] if text.strip() else []
    for attachment, marker in zip((image, audio, video), _ATTACHMENT_MARKERS):
        if attachment is not None:
            parts.append(marker)
    return " ".join(parts) if parts else "Multimodal input"


def _history_to_conversation(chat_history):
    """Convert the chatbot history into model-ready messages (system prompt first).

    A trailing user message is treated as the turn currently being processed
    (the UI adds it to the chatbot before calling process_input) and is left
    out; the caller appends the real multimodal content for that turn, so
    including it here would send the same turn to the model twice.
    """
    conversation = [SYSTEM_PROMPT]
    if not isinstance(chat_history, list):
        return conversation

    prior_turns = chat_history
    last = prior_turns[-1] if prior_turns else None
    if isinstance(last, dict) and last.get("role") == "user":
        prior_turns = prior_turns[:-1]

    for message in prior_turns:
        if isinstance(message, dict) and "role" in message and "content" in message:
            # Forward only the fields the chat template reads.
            conversation.append({"role": message["role"], "content": message["content"]})
        elif isinstance(message, (list, tuple)) and len(message) == 2:
            # Legacy (user, bot) pair format; skip pairs without a reply.
            user_msg, bot_msg = message
            if bot_msg is None:
                continue
            if isinstance(user_msg, str):
                stripped = user_msg
                for marker in _ATTACHMENT_MARKERS:
                    stripped = stripped.replace(marker, "")
                # Keep the original label if nothing but markers was present,
                # so the turn is never sent as an empty string.
                user_msg = stripped.strip() or user_msg
            conversation.append({"role": "user", "content": user_msg})
            conversation.append({"role": "assistant", "content": bot_msg})
    return conversation


def _append_turn(chat_history, user_display, reply):
    """Record the assistant reply, adding the user message only if it is missing."""
    if not isinstance(chat_history, list):
        chat_history = []
    assistant_message = {"role": "assistant", "content": reply}
    last = chat_history[-1] if chat_history else None
    if isinstance(last, dict) and last.get("role") == "user":
        # The UI already displayed the pending user message.
        chat_history.append(assistant_message)
    else:
        chat_history.extend([{"role": "user", "content": user_display}, assistant_message])
    return chat_history


def _to_model_device(value):
    """Move a tensor to the model's device; cast only floating-point tensors."""
    if not isinstance(value, torch.Tensor):
        return value
    if value.is_floating_point():
        return value.to(device=model.device, dtype=model.dtype)
    # Token ids, masks and grid sizes must stay integer typed.
    return value.to(device=model.device)


def _generate_text_only(inputs):
    """Run generation with speech output disabled and return the token ids."""
    return model.generate(
        **inputs,
        use_audio_in_video=False,
        return_audio=False,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        streamer=TextStreamer(processor, skip_prompt=True),
    )


def _strip_prompt_tokens(text_ids, input_ids):
    """Drop the echoed prompt from generated ids when it is verifiably a prefix."""
    if not (isinstance(text_ids, torch.Tensor) and isinstance(input_ids, torch.Tensor)):
        return text_ids
    if text_ids.dim() != 2 or input_ids.dim() != 2 or text_ids.shape[0] != input_ids.shape[0]:
        return text_ids
    prompt_len = input_ids.shape[1]
    if text_ids.shape[1] < prompt_len:
        return text_ids
    if not torch.equal(text_ids[:, :prompt_len], input_ids.to(text_ids.device)):
        return text_ids
    return text_ids[:, prompt_len:]


@spaces.GPU
def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
    """Run one chat turn through the model and update the chat history.

    Args:
        image: Image input for this turn, or None.
        audio: Audio input for this turn, or None.
        video: Video input for this turn, or None.
        text: Text typed by the user, or None.
        chat_history: Chatbot history as a list of ``{"role", "content"}``
            dicts (legacy two-item pairs are also accepted). A trailing user
            message is taken to be the turn being processed. Anything that is
            not a list is replaced by a new list.
        voice_type: Key of ``VOICE_OPTIONS``; unknown values fall back to
            "Chelsie".
        enable_audio_output: When true, speech is requested in addition to text.

    Returns:
        A tuple ``(chat_history, text_response, audio_path)``; ``audio_path``
        is the path of a temporary WAV file or None. Errors are never raised:
        they are printed and reported as an apology message in the same shape.
    """
    error_msg = "I apologize, but I encountered an error processing your request. Please try again."
    # Computed before any fallible work so that every error path can use it.
    user_message_for_display = _describe_user_turn(text, image, audio, video)

    try:
        # Clear GPU memory before processing
        _release_gpu_memory()

        # Prior turns plus the current multimodal input
        conversation = _history_to_conversation(chat_history)
        user_input = {"text": text, "image": image, "audio": audio, "video": video}
        conversation.append({"role": "user", "content": user_input_to_content(user_input)})

        # Prepare for inference
        model_input = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        try:
            audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
        except Exception as e:
            print(f"Error processing multimedia: {str(e)}")
            # None (not an empty list) is the "no media" value for the processor.
            audios, images, videos = None, None, None

        inputs = processor(
            text=model_input,
            audios=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: _to_model_device(v) for k, v in inputs.items()}
    except Exception as e:
        print(f"Error in process_input: {str(e)}")
        chat_history = _append_turn(chat_history, user_message_for_display, error_msg)
        return chat_history, error_msg, None

    try:
        text_ids = None
        audio_path = None

        if enable_audio_output:
            voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
            try:
                generation_output = model.generate(
                    **inputs,
                    use_audio_in_video=False,
                    return_audio=True,
                    spk=voice_type_value,
                    max_new_tokens=512,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9,
                    streamer=TextStreamer(processor, skip_prompt=True),
                )
                if isinstance(generation_output, tuple) and len(generation_output) == 2:
                    # text_ids is bound before the audio is saved, so a failed
                    # save keeps the generated text instead of regenerating it.
                    text_ids, waveform = generation_output
                    if waveform is not None:
                        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                            wav_path = tmp_file.name
                        # Written after the handle is closed; cast to float32
                        # because NumPy has no bfloat16.
                        sf.write(
                            wav_path,
                            waveform.reshape(-1).detach().cpu().float().numpy(),
                            samplerate=24000,
                        )
                        audio_path = wav_path
                else:
                    print("Warning: Unexpected generation output format")
            except Exception as e:
                print(f"Error during audio generation: {str(e)}")

        if text_ids is None:
            # Text-only request, or fallback after a failed speech generation.
            try:
                text_ids = _generate_text_only(inputs)
            except Exception as e:
                if enable_audio_output:
                    print(f"Error during fallback text generation: {str(e)}")
                else:
                    print(f"Error during text generation: {str(e)}")
                text_ids = None

        # Process the response
        if text_ids is not None and len(text_ids) > 0:
            try:
                # generate() echoes the prompt; decode only the new tokens so
                # the reply does not contain the system prompt and history.
                text_ids = _strip_prompt_tokens(text_ids, inputs.get("input_ids"))
                text_response = processor.batch_decode(
                    text_ids,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )[0]

                # Clean up text response
                text_response = text_response.strip()
                if "<|im_start|>assistant" in text_response:
                    text_response = text_response.split("<|im_start|>assistant")[-1]
                text_response = text_response.replace("<|im_end|>", "").replace("<|im_start|>", "")
                if text_response.startswith(":"):
                    text_response = text_response[1:].strip()
            except Exception as e:
                print(f"Error during text decoding: {str(e)}")
                text_response = "I apologize, but I encountered an error processing the response."
        else:
            text_response = "I apologize, but I encountered an error generating a response."

        chat_history = _append_turn(chat_history, user_message_for_display, text_response)

        # Clear GPU memory after processing
        _release_gpu_memory()

        # audio_path is only ever set when audio output was requested.
        return chat_history, text_response, audio_path

    except Exception as e:
        print(f"Error during generation: {str(e)}")
        chat_history = _append_turn(chat_history, user_message_for_display, error_msg)
        return chat_history, error_msg, None

def user_input_to_content(user_input):
    """Turn one user turn into chat-message content.

    A dict is expanded into a list of ``{"type": kind, kind: value}`` parts,
    one per non-empty modality, always in the order text, image, audio,
    video. Any other value (a plain string included) is returned unchanged.
    """
    if not isinstance(user_input, dict):
        return user_input

    parts = []
    for kind in ("text", "image", "audio", "video"):
        value = user_input.get(kind)
        # Missing keys and falsy values (None, "") contribute no part.
        if value:
            parts.append({"type": kind, kind: value})
    return parts

def create_demo():
    """Build the Gradio Blocks UI for the chat demo and wire its events.

    The layout has a chatbot with text and multimodal input tabs on the left,
    and a capabilities panel, example prompts and the model's speech/text
    output on the right. Each submit button first appends the user's message
    to the chatbot, then calls ``process_input``, then clears the inputs.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="Qwen2.5-Omni Chat Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
        gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")

        # Hidden placeholder components for text-only input
        placeholder_image = gr.Image(type="filepath", visible=False)
        placeholder_audio = gr.Audio(type="filepath", visible=False)
        placeholder_video = gr.Video(visible=False)

        # Chat interface
        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(
                    height=600,
                    show_label=False,
                    avatar_images=["user.png", "assistant.png"],
                    type="messages"
                )
                with gr.Accordion("Advanced Options", open=False):
                    voice_type = gr.Dropdown(
                        choices=list(VOICE_OPTIONS.keys()),
                        value="Chelsie (Female)",
                        label="Voice Type"
                    )
                    enable_audio_output = gr.Checkbox(
                        value=True,
                        label="Enable Audio Output"
                    )

                # Multimodal input components
                with gr.Tabs():
                    with gr.TabItem("Text Input"):
                        text_input = gr.Textbox(
                            placeholder="Type your message here...",
                            label="Text Input",
                            autofocus=True,
                            container=False,
                        )
                        text_submit = gr.Button("Send Text", variant="primary")

                    with gr.TabItem("Multimodal Input"):
                        with gr.Row():
                            image_input = gr.Image(
                                type="filepath",
                                label="Upload Image"
                            )
                            audio_input = gr.Audio(
                                type="filepath",
                                label="Upload Audio"
                            )
                        with gr.Row():
                            video_input = gr.Video(
                                label="Upload Video"
                            )
                        additional_text = gr.Textbox(
                            placeholder="Additional text message...",
                            label="Additional Text",
                            container=False,
                        )
                        multimodal_submit = gr.Button("Send Multimodal Input", variant="primary")

                clear_button = gr.Button("Clear Chat")

            with gr.Column(scale=1):
                gr.Markdown("## Model Capabilities")
                gr.Markdown("""
                **Qwen2.5-Omni can:**
                - Process and understand text
                - Analyze images and answer questions about them
                - Transcribe and understand audio
                - Analyze video content (with or without audio)
                - Generate natural speech responses
                """)

                gr.Markdown("### Example Prompts")
                gr.Examples(
                    examples=[
                        ["Describe what you see in this image", "image"],
                        ["What is being said in this audio clip?", "audio"],
                        ["What's happening in this video?", "video"],
                        ["Explain quantum computing in simple terms", "text"],
                        ["Generate a short story about a robot learning to paint", "text"]
                    ],
                    inputs=[text_input, gr.Textbox(visible=False)],
                    label="Text Examples"
                )

                audio_output = gr.Audio(
                    label="Model Speech Output",
                    visible=True,
                    autoplay=True
                )
                text_output = gr.Textbox(
                    label="Model Text Response",
                    interactive=False
                )

        # Text input handling
        def add_text_turn(text, history):
            # Append to the existing conversation; returning a fresh
            # one-element list would wipe the chat on every message.
            history = list(history or [])
            history.append({"role": "user", "content": text if text is not None else ""})
            return history

        text_submit.click(
            fn=add_text_turn,
            inputs=[text_input, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        ).then(
            fn=lambda: "",  # Clear input after submission
            outputs=text_input
        )

        # Multimodal input handling
        def prepare_multimodal_input(image, audio, video, text, history):
            # Create a display message that indicates what was uploaded
            display_message = str(text) if text is not None else ""
            if image is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Image]"
            if audio is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Audio]"
            if video is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Video]"

            if not display_message.strip():
                display_message = "Multimodal content"

            # Keep earlier turns and add the new one at the end.
            history = list(history or [])
            history.append({"role": "user", "content": display_message})
            return history

        multimodal_submit.click(
            fn=prepare_multimodal_input,
            inputs=[image_input, audio_input, video_input, additional_text, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[image_input, audio_input, video_input, additional_text,
                   chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        ).then(
            fn=lambda: (None, None, None, ""),  # Clear inputs after submission
            outputs=[image_input, audio_input, video_input, additional_text]
        )

        # Clear chat
        def clear_chat():
            # Empty history, no text response, no audio.
            return [], None, None

        clear_button.click(
            fn=clear_chat,
            outputs=[chatbot, text_output, audio_output]
        )

        # Update audio output visibility
        def toggle_audio_output(enable_audio):
            return gr.Audio(visible=enable_audio)

        enable_audio_output.change(
            fn=toggle_audio_output,
            inputs=enable_audio_output,
            outputs=audio_output
        )

    return demo

# Script entry point: build the UI and serve it on all network interfaces
# (0.0.0.0) at port 7860. Nothing is launched when the module is imported.
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)