import subprocess
import gradio as gr
import spaces
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers.utils import is_flash_attn_2_available
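# flash-attn is installed at startup so FlashAttention-2 is available by the time the
# GPU worker runs; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips compiling the CUDA
# kernels during install (the usual pattern on ZeroGPU Spaces).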
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
# Mapping user-friendly names to HF model IDs
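# The AWQ variants are 4-bit weight-quantized checkpoints that need noticeably less
# GPU memory than the full-precision models, at a small quality cost.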
MODEL_NAMES = {
    "Qwen2.5-VL-7B-Instruct-AWQ": "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
    "Qwen2.5-VL-3B-Instruct-AWQ": "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
    "Qwen2.5-VL-7B-Instruct": "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen2.5-VL-3B-Instruct": "Qwen/Qwen2.5-VL-3B-Instruct",
}
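# @spaces.GPU requests a ZeroGPU allocation for each call; duration=300 allows up to
# 300 seconds per request, leaving headroom for model loading on the first call.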
@spaces.GPU(duration=300)
def run_inference(model_key, input_type, text, image, video, fps, system_prompt, add_vision_id):
    """
    Load the selected Qwen2.5-VL model and run inference on text, image, or video.
    """
    model_id = MODEL_NAMES[model_key]
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto",
        attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
    )
    processor = AutoProcessor.from_pretrained(model_id)
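    # Note: the Qwen2.5-VL processor also accepts min_pixels/max_pixels to bound the
    # number of visual tokens per image, which can help on smaller GPUs.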
    # Text-only inference: build a chat prompt so the system prompt is honoured
    # and the special tokens expected by the instruct models are inserted
    if input_type == "text":
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text or ""},
        ]
        text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text_prompt], return_tensors="pt", padding=True)
        inputs = inputs.to(model.device)
        gen_ids = model.generate(**inputs, max_new_tokens=512)
        # Return only the newly generated tokens, not the echoed prompt
        trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, gen_ids)]
        return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
    # Multimodal inference (image or video)
    content = []
    if input_type == "image" and image:
        # gr.File(file_count="multiple") returns a list; normalise to a list of local paths
        files = image if isinstance(image, list) else [image]
        for f in files:
            path = f.name if hasattr(f, "name") else str(f)
            content.append({"type": "image", "image": f"file://{path}"})
    elif input_type == "video" and video:
        # Ensure a file URI for local files
        video_src = video if str(video).startswith("file://") else f"file://{video}"
        content.append({"type": "video", "video": video_src, "fps": fps})
    content.append({"type": "text", "text": text or ""})
    msg = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ]
    # Build the chat prompt and collect vision inputs (video kwargs carry e.g. the fps)
    text_prompt = processor.apply_chat_template(
        msg,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=add_vision_id,
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs
    )
    inputs = inputs.to(model.device)
    gen_ids = model.generate(**inputs, max_new_tokens=512)
    # Trim the prompt tokens so only the generated answer is decoded
    trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, gen_ids)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
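# Direct call example (hypothetical local path), bypassing the UI:
#   run_inference("Qwen2.5-VL-3B-Instruct", "image", "Describe this picture.",
#                 ["/path/to/picture.jpg"], None, 2.0,
#                 "You are a helpful assistant.", False)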
# Build the Gradio interface
demo = gr.Blocks()
with demo:
    gr.Markdown("# Qwen2.5-VL Multimodal Demo")
    model_select = gr.Dropdown(
        list(MODEL_NAMES.keys()),
        value=list(MODEL_NAMES.keys())[0],
        label="Select Model",
    )
    input_type = gr.Radio(["text", "image", "video"], value="text", label="Input Type")
    system_prompt_input = gr.Textbox(
        lines=2,
        placeholder="System prompt…",
        value="You are a helpful assistant.",
        label="System Prompt"
    )
    vision_id_checkbox = gr.Checkbox(
        label="Add vision ID",
        value=False
    )
    text_input = gr.Textbox(
        lines=3,
        placeholder="Enter text ...",
        visible=True
    )
    image_input = gr.File(
        file_count="multiple",
        file_types=["image"],
        label="Upload Images",
        visible=False
    )
    video_input = gr.Video(visible=False)
    fps_input = gr.Number(
        value=2.0,
        label="FPS",
        visible=False
    )
    output = gr.Textbox(label="Output")
    # Show/hide inputs based on the selected input type; the text box stays visible
    # because image and video requests also take a text prompt
    def update_inputs(choice):
        return (
            gr.update(visible=True),
            gr.update(visible=(choice == "image")),
            gr.update(visible=(choice == "video")),
            gr.update(visible=(choice == "video"))
        )
    input_type.change(update_inputs, input_type, [text_input, image_input, video_input, fps_input])
    run_btn = gr.Button("Generate")
    run_btn.click(
        run_inference,
        [
            model_select,
            input_type,
            text_input,
            image_input,
            video_input,
            fps_input,
            system_prompt_input,
            vision_id_checkbox
        ],
        output
    )
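    # Optional: demo.queue() could be called here before launch to serialize
    # concurrent requests, which helps when each GPU call takes a long time.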
# Launch the app
if __name__ == "__main__":
    demo.launch()