# Qwen2.5VL / app.py
import os
import subprocess

import gradio as gr
import spaces
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers.utils import is_flash_attn_2_available

# Install flash-attn at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD avoids compiling
# the CUDA extension during install. os.environ is merged in so pip keeps PATH and
# the CUDA-related variables of the current environment.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
# Mapping user-friendly names to HF model IDs
MODEL_NAMES = {
"Qwen2.5-VL-7B-Instruct-AWQ": "Qwen/Qwen2.5-VL-7B-Instruct-AWQ",
"Qwen2.5-VL-3B-Instruct-AWQ": "Qwen/Qwen2.5-VL-3B-Instruct-AWQ",
"Qwen2.5-VL-7B-Instruct": "Qwen/Qwen2.5-VL-7B-Instruct",
"Qwen2.5-VL-3B-Instruct": "Qwen/Qwen2.5-VL-3B-Instruct",
}
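# Note: the AWQ checkpoints above are loaded through transformers' AWQ integration,
# which typically requires the `autoawq` package to be installed in the environment.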
@spaces.GPU(duration=300)
def run_inference(model_key, input_type, text, image, video, fps, system_prompt, add_vision_id):
"""
Load the selected Qwen2.5-VL model and run inference on text, image, or video.
"""
model_id = MODEL_NAMES[model_key]
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
model_id,
torch_dtype="auto",
device_map="auto",
attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
)
processor = AutoProcessor.from_pretrained(model_id)
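    # Optional (not enabled here): the Qwen2.5-VL processor also accepts min_pixels /
    # max_pixels to bound the number of visual tokens per image, e.g.
    #   AutoProcessor.from_pretrained(model_id, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28)
    # This demo keeps the processor defaults.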
    # Text-only inference: wrap the prompt in the chat template so the system
    # prompt is applied, and decode only the newly generated tokens.
    if input_type == "text":
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text or ""},
        ]
        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[prompt], padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=512)
        trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, outputs)]
        return processor.batch_decode(trimmed, skip_special_tokens=True)[0]

    # Multimodal inference (image or video)
    content = []
    if input_type == "image" and image:
        # gr.File with file_count="multiple" returns a list of uploaded file paths
        for img in image:
            content.append({"type": "image", "image": img})
    elif input_type == "video" and video:
        # Ensure a file:// URI for local video files
        video_src = video if str(video).startswith("file://") else f"file://{video}"
        content.append({"type": "video", "video": video_src, "fps": fps})
    content.append({"type": "text", "text": text or ""})
msg = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": content}
]
# Prepare inputs for model with video kwargs
text_prompt = processor.apply_chat_template(
msg,
tokenize=False,
add_generation_prompt=True,
add_vision_id=add_vision_id
)
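    # process_vision_info extracts the images/frames referenced in the messages;
    # with return_video_kwargs=True it also returns video kwargs (e.g. the sampling
    # fps) that are forwarded to the processor below.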
image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
inputs = processor(
text=[text_prompt],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
**video_kwargs
)
inputs = inputs.to(model.device)
gen_ids = model.generate(**inputs, max_new_tokens=512)
# Trim the prompt tokens
trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, gen_ids)]
return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
# Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-VL Multimodal Demo")
    # Preselect defaults so run_inference never receives None for these controls
    model_select = gr.Dropdown(
        list(MODEL_NAMES.keys()),
        value=list(MODEL_NAMES.keys())[0],
        label="Select Model",
    )
    input_type = gr.Radio(["text", "image", "video"], value="text", label="Input Type")
system_prompt_input = gr.Textbox(
lines=2,
placeholder="System prompt…",
value="You are a helpful assistant.",
label="System Prompt"
)
vision_id_checkbox = gr.Checkbox(
label="Add vision ID",
value=False
)
    text_input = gr.Textbox(
        lines=3,
        placeholder="Enter text ...",
        label="Prompt",
        visible=True,
    )
image_input = gr.File(
file_count="multiple",
file_types=["image"],
label="Upload Images",
visible=False
)
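    # gr.Video returns a local file path by default, which run_inference converts
    # into a file:// URI for the video message content.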
video_input = gr.Video(visible=False)
fps_input = gr.Number(
value=2.0,
label="FPS",
visible=False
)
output = gr.Textbox(label="Output")
# Show/hide inputs based on selection
def update_inputs(choice):
return (
gr.update(visible=True),
gr.update(visible=(choice == "image")),
gr.update(visible=(choice == "video")),
gr.update(visible=(choice == "video"))
)
input_type.change(update_inputs, input_type, [text_input, image_input, video_input, fps_input])
run_btn = gr.Button("Generate")
run_btn.click(
run_inference,
[
model_select,
input_type,
text_input,
image_input,
video_input,
fps_input,
system_prompt_input,
vision_id_checkbox
],
output
)
# Launch the app
if __name__ == "__main__":
demo.launch()
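
# Rough local-run sketch (assumptions: a CUDA GPU is available and the packages
# below are installed; exact versions are not pinned in this file):
#
#   pip install torch gradio spaces transformers accelerate qwen-vl-utils
#   python app.py
#
# Outside Hugging Face Spaces the @spaces.GPU decorator is expected to be a no-op,
# and the flash-attn install step can be skipped (the code falls back to the
# default attention implementation when flash-attn is unavailable).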