import contextlib import subprocess import time from typing import Iterator, Callable import gradio as gr import spaces from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor from qwen_vl_utils import process_vision_info subprocess.run( "pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True, ) # Mapping user-friendly names to HF model IDs MODEL_NAMES = { "Qwen2.5-VL-7B-Instruct": "Qwen/Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct": "Qwen/Qwen2.5-VL-3B-Instruct", } @contextlib.contextmanager def measure_time() -> Iterator[Callable[[], float]]: """ A context manager for measuring execution time (in seconds) within its code block. usage: with code_timer() as timer: # Code snippet to be timed print(f"Code took: {timer()} seconds") """ start_time = end_time = time.perf_counter() yield lambda: end_time - start_time end_time = time.perf_counter() @spaces.GPU(duration=300) def run_inference(model_key, input_type, text, image, video, fps, system_prompt, add_vision_id): """ Load the selected Qwen2.5-VL model and run inference on text, image, or video. """ model_id = MODEL_NAMES[model_key] model = Qwen2_5_VLForConditionalGeneration.from_pretrained( model_id, torch_dtype="auto", device_map="auto", ) processor = AutoProcessor.from_pretrained(model_id) # Text-only inference if input_type == "text": inputs = processor( text=text, return_tensors="pt", padding=True ) inputs = inputs.to(model.device) outputs = model.generate(**inputs, max_new_tokens=512) return processor.batch_decode(outputs, skip_special_tokens=True)[0] # Multimodal inference (image or video) content = [] if input_type == "image" and image: for img_path in image: content.append({"type": "image", "image": img_path}) elif input_type == "video" and video: # Ensure file URI for local files video_src = video if str(video).startswith("file://") else f"file://{video}" content.append({"type": "video", "video": video_src, "fps": fps}) content.append({"type": "text", "text": text or ""}) msg = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": content} ] # Prepare inputs for model with video kwargs text_prompt = processor.apply_chat_template( msg, tokenize=False, add_generation_prompt=True, add_vision_id=add_vision_id ) image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True) inputs = processor( text=[text_prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", **video_kwargs ) inputs = inputs.to(model.device) with measure_time() as timer: gen_ids = model.generate(**inputs, max_new_tokens=512) # Trim the prompt tokens trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, gen_ids)] result = processor.batch_decode(trimmed, skip_special_tokens=True)[0] gr.Info(f"Finished in {timer():.2f}s", title="Success", duration=5) # green-style info toast :contentReference[oaicite:0]{index=0} return result # Build Gradio interface demo = gr.Blocks() with demo: gr.Markdown("# Qwen2.5-VL Multimodal Demo") model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model") input_type = gr.Radio(["text", "image", "video"], label="Input Type") system_prompt_input = gr.Textbox( lines=2, placeholder="System prompt…", value="You are a helpful assistant.", label="System Prompt" ) vision_id_checkbox = gr.Checkbox( label="Add vision ID", value=False ) text_input = gr.Textbox( lines=3, placeholder="Enter text ...", visible=True ) image_input = gr.File( file_count="multiple", file_types=["image"], label="Upload Images", visible=False ) video_input = gr.Video(visible=False) fps_input = gr.Number( value=2.0, label="FPS", visible=False ) output = gr.Textbox(label="Output") # Show/hide inputs based on selection def update_inputs(choice): return ( gr.update(visible=True), gr.update(visible=(choice == "image")), gr.update(visible=(choice == "video")), gr.update(visible=(choice == "video")) ) input_type.change(update_inputs, input_type, [text_input, image_input, video_input, fps_input]) run_btn = gr.Button("Generate") run_btn.click( run_inference, [ model_select, input_type, text_input, image_input, video_input, fps_input, system_prompt_input, vision_id_checkbox ], output ) # Launch the app if __name__ == "__main__": demo.launch()