ankandrew committed
Commit 2e3ddd8 · 1 Parent(s): f0c7145

Update gradio demo

Files changed (1): app.py +105 -5
app.py CHANGED
@@ -1,11 +1,111 @@
+import subprocess
 import gradio as gr
 import spaces
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
 
-@spaces.GPU
-def greet(name):
-    return "Hello " + name + "!!"
-
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+
+# Mapping user-friendly names to HF model IDs
+MODEL_NAMES = {
+    "Qwen2.5-VL-7B-Instruct-AWQ": "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
+    "Qwen2.5-VL-3B-Instruct-AWQ": "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
+    "Qwen2.5-VL-7B-Instruct": "Qwen/Qwen2.5-VL-7B-Instruct",
+    "Qwen2.5-VL-3B-Instruct": "Qwen/Qwen2.5-VL-3B-Instruct",
+}
+
+
+@spaces.GPU(duration=300)
+def run_inference(model_key, input_type, text, image, video, fps):
+    """
+    Load the selected Qwen2.5-VL model and run inference on text, image, or video.
+    """
+    model_id = MODEL_NAMES[model_key]
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        model_id,
+        torch_dtype="auto",
+        device_map="auto"
+    )
+    processor = AutoProcessor.from_pretrained(model_id)
+
+    # Text-only inference
+    if input_type == "text":
+        inputs = processor(
+            text=text,
+            return_tensors="pt",
+            padding=True
+        )
+        inputs = inputs.to(model.device)
+        outputs = model.generate(**inputs, max_new_tokens=512)
+        return processor.batch_decode(outputs, skip_special_tokens=True)[0]
+
+    # Multimodal inference (image or video)
+    content = []
+    if input_type == "image" and image:
+        content.append({"type": "image", "image": image})
+    elif input_type == "video" and video:
+        # Ensure file URI for local files
+        video_src = video if str(video).startswith("file://") else f"file://{video}"
+        content.append({"type": "video", "video": video_src, "fps": fps})
+    content.append({"type": "text", "text": text or ""})
+    msg = [{"role": "user", "content": content}]
+
+    # Prepare inputs for model with video kwargs
+    text_prompt = processor.apply_chat_template(
+        msg, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
+    inputs = processor(
+        text=[text_prompt],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+        **video_kwargs
+    )
+    inputs = inputs.to(model.device)
+
+    gen_ids = model.generate(**inputs, max_new_tokens=512)
+    # Trim the prompt tokens
+    trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, gen_ids)]
+    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
+
+
+# Build Gradio interface
+demo = gr.Blocks()
+with demo:
+    gr.Markdown("# Qwen2.5-VL Multimodal Demo")
+    model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
+    input_type = gr.Radio(["text", "image", "video"], label="Input Type")
+    text_input = gr.Textbox(lines=3, placeholder="Enter text...", visible=True)
+    image_input = gr.Image(type="filepath", visible=False)
+    video_input = gr.Video(type="filepath", visible=False)
+    fps_input = gr.Slider(minimum=0.1, maximum=30.0, step=0.1, value=2.0, label="FPS", visible=False)
+    output = gr.Textbox(label="Output")
+
+    # Show/hide inputs based on selection
+    def update_inputs(choice):
+        return (
+            gr.update(visible=(choice == "text")),
+            gr.update(visible=(choice == "image")),
+            gr.update(visible=(choice == "video")),
+            gr.update(visible=(choice == "video"))
+        )
+
+    input_type.change(update_inputs, input_type, [text_input, image_input, video_input, fps_input])
+    run_btn = gr.Button("Generate")
+    run_btn.click(
+        run_inference,
+        [model_select, input_type, text_input, image_input, video_input, fps_input],
+        output
+    )
+
+# Launch the app
+if __name__ == "__main__":
+    demo.launch()
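
For exercising the updated demo programmatically, a minimal sketch with gradio_client could look like the following. The Space id "ankandrew/qwen2.5-vl-demo" is a placeholder (the actual repo id is not given in this commit), and it assumes the click handler keeps Gradio's default endpoint name, "/run_inference".

from gradio_client import Client

# Placeholder Space id; replace with the real repo id of this demo.
client = Client("ankandrew/qwen2.5-vl-demo")

# Text-only request: image and video inputs are left as None, fps keeps its default.
result = client.predict(
    "Qwen2.5-VL-3B-Instruct",   # model_select
    "text",                     # input_type
    "Give me a one-line summary of what Qwen2.5-VL can do.",  # text_input
    None,                       # image_input
    None,                       # video_input
    2.0,                        # fps_input
    api_name="/run_inference",
)
print(result)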