# Qwen2.5VL / app.py
import contextlib
import subprocess
import time
from typing import Iterator, Callable
import gradio as gr
import spaces
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
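
# Install FlashAttention at import time. FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE makes
# `pip install flash-attn` skip compiling the CUDA extension, a common workaround on
# Spaces where no GPU is available at build time.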
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Mapping user-friendly names to HF model IDs
MODEL_NAMES = {
    "Qwen2.5-VL-7B-Instruct": "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen2.5-VL-3B-Instruct": "Qwen/Qwen2.5-VL-3B-Instruct",
}


@contextlib.contextmanager
def measure_time() -> Iterator[Callable[[], float]]:
"""
A context manager for measuring execution time (in seconds) within its code block.
usage:
with code_timer() as timer:
# Code snippet to be timed
print(f"Code took: {timer()} seconds")
"""
start_time = end_time = time.perf_counter()
yield lambda: end_time - start_time
end_time = time.perf_counter()
@spaces.GPU(duration=300)
def run_inference(model_key, input_type, text, image, video, fps, system_prompt, add_vision_id):
"""
Load the selected Qwen2.5-VL model and run inference on text, image, or video.
"""
model_id = MODEL_NAMES[model_key]
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
model_id,
torch_dtype="auto",
device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)
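    # Note: the model and processor are loaded on every request inside this
    # @spaces.GPU-decorated call; caching them at module level could speed up
    # repeated requests to the same model.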

    # Text-only inference
    if input_type == "text":
        inputs = processor(
            text=text,
            return_tensors="pt",
            padding=True,
        )
        inputs = inputs.to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=512)
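        # Note: unlike the multimodal path below, the prompt tokens are not trimmed,
        # so the decoded output echoes the input text before the model's continuation.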
        return processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # Multimodal inference (image or video)
    content = []
    if input_type == "image" and image:
        for img_path in image:
            content.append({"type": "image", "image": img_path})
    elif input_type == "video" and video:
        # Ensure file URI for local files
        video_src = video if str(video).startswith("file://") else f"file://{video}"
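        # "fps" is read by qwen_vl_utils when sampling frames from the video.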
        content.append({"type": "video", "video": video_src, "fps": fps})
    content.append({"type": "text", "text": text or ""})

    msg = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": content},
    ]

    # Prepare inputs for model with video kwargs
    text_prompt = processor.apply_chat_template(
        msg,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=add_vision_id,
    )
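    # With return_video_kwargs=True, process_vision_info also returns video-specific
    # kwargs (e.g. fps) that are forwarded to the processor below.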
    image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs,
    )
    inputs = inputs.to(model.device)

    # Time only the generation step
    with measure_time() as timer:
        gen_ids = model.generate(**inputs, max_new_tokens=512)

    # Trim the prompt tokens
    trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, gen_ids)]
    result = processor.batch_decode(trimmed, skip_special_tokens=True)[0]
    gr.Info(f"Finished in {timer():.2f}s", title="Success", duration=5)  # success toast with elapsed time
    return result


# Build Gradio interface
demo = gr.Blocks()
with demo:
    gr.Markdown("# Qwen2.5-VL Multimodal Demo")
    model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
    input_type = gr.Radio(["text", "image", "video"], label="Input Type")
    system_prompt_input = gr.Textbox(
        lines=2,
        placeholder="System prompt…",
        value="You are a helpful assistant.",
        label="System Prompt",
    )
    vision_id_checkbox = gr.Checkbox(
        label="Add vision ID",
        value=False,
    )
    text_input = gr.Textbox(
        lines=3,
        placeholder="Enter text ...",
        visible=True,
    )
    image_input = gr.File(
        file_count="multiple",
        file_types=["image"],
        label="Upload Images",
        visible=False,
    )
    video_input = gr.Video(visible=False)
    fps_input = gr.Number(
        value=2.0,
        label="FPS",
        visible=False,
    )
    output = gr.Textbox(label="Output")

    # Show/hide inputs based on selection
    def update_inputs(choice):
        return (
            gr.update(visible=True),
            gr.update(visible=(choice == "image")),
            gr.update(visible=(choice == "video")),
            gr.update(visible=(choice == "video")),
        )

    input_type.change(update_inputs, input_type, [text_input, image_input, video_input, fps_input])

    run_btn = gr.Button("Generate")
    run_btn.click(
        run_inference,
        [
            model_select,
            input_type,
            text_input,
            image_input,
            video_input,
            fps_input,
            system_prompt_input,
            vision_id_checkbox,
        ],
        output,
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()
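

# A minimal, commented-out smoke test for calling run_inference directly (outside
# Gradio). "example.jpg" is a hypothetical local image path, not part of this repo:
#
#   print(run_inference(
#       "Qwen2.5-VL-3B-Instruct",        # model_key
#       "image",                         # input_type
#       "Describe this image.",          # text
#       ["example.jpg"],                 # image: list of file paths (hypothetical)
#       None,                            # video
#       2.0,                             # fps
#       "You are a helpful assistant.",  # system_prompt
#       False,                           # add_vision_id
#   ))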