Qwen2-VL-2B

Paused

App Files Files Community

Qwen2-VL-2B / app.py

omer-bhutta

Update app.py

95a4394 verified 2 months ago

raw

history blame contribute delete

9.35 kB

	import os
	import sys

	# Select the "decord" video reader. The variable is assigned here, ahead of the
	# qwen_vl_utils import further down -- presumably that package consults it when
	# it is loaded (TODO confirm against qwen_vl_utils).
	os.environ["FORCE_QWENVL_VIDEO_READER"] = "decord"

	# Echo the effective value to stderr and flush immediately so the line is
	# emitted before the rest of startup runs.
	print(
	    "Startup check: Using FORCE_QWENVL_VIDEO_READER=",
	    os.environ.get("FORCE_QWENVL_VIDEO_READER"),
	    file=sys.stderr,
	    flush=True,
	)


	import gradio as gr
	import spaces
	from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
	from qwen_vl_utils import process_vision_info
	import torch
	from PIL import Image
	import subprocess
	import numpy as np
	import os
	from threading import Thread
	import uuid
	import io

	# Checkpoint served by this demo. The weights and the processor are loaded a
	# single time, when this module is executed.
	MODEL_ID = "omni-research/Tarsier2-Recap-7b"

	# Load in float16, move the model to the CUDA device, then switch to eval mode.
	model = Qwen2VLForConditionalGeneration.from_pretrained(
	    MODEL_ID,
	    trust_remote_code=True,
	    torch_dtype=torch.float16,
	)
	model = model.to("cuda")
	model = model.eval()

	processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

	# Text shown through gr.Markdown at the top of the page.
	DESCRIPTION = "Behavioral Video Analysis Demo"

	# Pillow's extension registry; qwen_inference uses its keys as image suffixes.
	image_extensions = Image.registered_extensions()

	# Filename suffixes (written without a leading dot) treated as video uploads.
	video_extensions = (
	    "avi",
	    "mp4",
	    "mov",
	    "mkv",
	    "flv",
	    "wmv",
	    "mjpeg",
	    "wav",
	    "gif",
	    "webm",
	    "m4v",
	    "3gp",
	)


	def identify_and_save_blob(blob_path):
	    """Copy a file of unknown type to a new file named with a media suffix.

	    The file's bytes are read and probed with Pillow. If they verify as an
	    image the copy is named ``.png``; otherwise the content is assumed to be a
	    video and the copy is named ``.mp4``. The bytes are written unchanged in
	    both cases -- only the filename suffix differs.

	    Args:
	        blob_path: Path of the file to inspect.

	    Returns:
	        A ``(filename, media_type)`` tuple. ``filename`` is the newly written
	        copy (a relative path, so it lands in the current working directory)
	        and ``media_type`` is ``"image"`` or ``"video"``.

	    Raises:
	        ValueError: If ``blob_path`` does not exist, or if any other error
	            occurs while reading, probing or writing. The underlying
	            exception is attached as ``__cause__``.
	    """
	    try:
	        with open(blob_path, "rb") as file:
	            blob_content = file.read()

	        # Probe the bytes: anything Pillow cannot verify is treated as video.
	        try:
	            with Image.open(io.BytesIO(blob_content)) as probe:
	                probe.verify()
	        except (IOError, SyntaxError):
	            extension = ".mp4"  # Default to MP4 for saving
	            media_type = "video"
	        else:
	            extension = ".png"  # Default to PNG for saving
	            media_type = "image"

	        # A random UUID keeps copies from concurrent requests from colliding.
	        filename = f"temp_{uuid.uuid4()}_media{extension}"
	        with open(filename, "wb") as f:
	            f.write(blob_content)

	        return filename, media_type

	    except FileNotFoundError as err:
	        raise ValueError(f"The file {blob_path} was not found.") from err
	    except Exception as e:
	        raise ValueError(f"An error occurred while processing the file: {e}") from e


	def _resolve_media(media_input):
	    """Classify an uploaded file path as image or video.

	    Args:
	        media_input: Filesystem path of the upload.

	    Returns:
	        A ``(media_path, media_type, is_temp_copy)`` tuple. ``media_type`` is
	        ``"image"`` or ``"video"``. ``is_temp_copy`` is True when the path was
	        produced by ``identify_and_save_blob`` and should be deleted after use.

	    Raises:
	        ValueError: If no path was supplied, or the file could not be
	            identified as an image or a video.
	    """
	    # Nothing uploaded (or not a path): fail with a clear message instead of
	    # hitting an unbound local further down.
	    if not isinstance(media_input, str) or not media_input:
	        raise ValueError("No media provided. Please upload an image or video.")

	    # Compare on the lowercased path so "clip.MP4" matches like "clip.mp4".
	    lowered = media_input.lower()
	    if lowered.endswith(tuple(image_extensions)):
	        return media_input, "image", False
	    if lowered.endswith(video_extensions):
	        return media_input, "video", False

	    # Unrecognised suffix: fall back to sniffing the file contents.
	    try:
	        media_path, media_type = identify_and_save_blob(media_input)
	    except Exception as e:
	        print(e)
	        raise ValueError("Unsupported media type. Please upload an image or video.") from e
	    print(media_path, media_type)
	    return media_path, media_type, True


	@spaces.GPU
	def qwen_inference(media_input):
	    """Stream the model's answer to a fixed behavioral-typology prompt.

	    The prompt is hard-coded; the only input is the uploaded media file.

	    Args:
	        media_input: Filesystem path of the uploaded image or video.

	    Yields:
	        The accumulated generated text so far, once per chunk received from
	        the streamer.

	    Raises:
	        ValueError: If no media was supplied or its type is unsupported.
	    """

	    # 1. Identify whether media_input is an image or video filepath
	    media_path, media_type, is_temp_copy = _resolve_media(media_input)
	    print(media_path)

	    # 2. Hard-code the text prompt here
	    fixed_prompt_text = """

	Use the following typology to describe the behaviors of the child in the video

	indicator_1 indicator_2 indicator_3 sr_no
	Behavioral Category Holding Objects Holding two random objects, often simultaneously 1
	Behavioral Category Holding Objects Persistent attachment to specific objects 2
	Behavioral Category Eye Contact and Engagement Lack of eye contact or minimal eye engagement 3
	Behavioral Category Eye Contact and Engagement Focus on objects rather than people during interaction 4
	Behavioral Category Eye Contact and Engagement Unresponsive to name being called or other verbal cues 5
	Behavioral Category Eye Contact and Engagement Limited back-and-forth gaze between people and objects 6
	Behavioral Category Facial Expressions Flat or unexpressive face 7
	Behavioral Category Facial Expressions Limited range of facial expressions 8
	Behavioral Category Facial Expressions Occasional tense or grimacing facial posture 9
	Behavioral Category Social Interaction Lack of shared enjoyment or visible emotional connection during interactions 10
	Behavioral Category Social Interaction Disinterest in other people, even when they are engaging 11
	Behavioral Category Social Interaction Inconsistent or no acknowledgment of social gestures like pointing 12
	Movement and Gestures Repetitive Movements Hand flapping 13
	Movement and Gestures Repetitive Movements Toe walking or bouncing on toes 14
	Movement and Gestures Repetitive Movements Rocking back and forth, sometimes aggressively 15
	Movement and Gestures Repetitive Movements Pacing or repetitive movements in a fixed area 16
	Movement and Gestures Repetitive Movements Head shaking side to side 17
	Movement and Gestures Repetitive Movements Spinning 18
	Movement and Gestures Gestural Communication Using another person’s hand to point, request, or manipulate objects 19
	Movement and Gestures Gestural Communication Nodding 20
	Interaction with Toys and Objects Play Behavior Lining up toys or objects systematically, often by color or type 21
	Interaction with Toys and Objects Play Behavior Stacking items like cans or blocks repeatedly 22
	Interaction with Toys and Objects Play Behavior Fixation on spinning objects or wheels 23
	Interaction with Toys and Objects Play Behavior Inspecting objects from unusual angles, such as sideways 24
	Interaction with Toys and Objects Sensory Preferences Chewing or mouthing objects 25
	Interaction with Toys and Objects Sensory Preferences Sensory-seeking behaviors like rubbing textures or spinning in circles without getting dizzy 26
	Interaction with Toys and Objects Sensory Preferences Sensitivity to sounds, often covering ears 27
	Interaction with Toys and Objects Sensory Preferences Visual inspection of objects up close or intensely 28
	Gender and Developmental Nuances Gender-Based Masking Females may mimic or "mask" typical behaviors more effectively, making symptoms less apparent 29
	Gender and Developmental Nuances Gender-Based Masking Girls may demonstrate learned emotional and social responses that obscure typical signs 30
	Gender and Developmental Nuances Developmental Indicators Delays or atypical development in social communication and interaction milestones 31
	Gender and Developmental Nuances Developmental Indicators Difficulty with back-and-forth conversation or social reciprocity 32

	Your output should be a list of only the indicators that were observed in the video. Do not include any indicators for which evidence is low or non-existent
	"""

	    try:
	        # 3. Construct the messages with your fixed text
	        media_entry = {
	            "type": media_type,
	            media_type: media_path,
	        }
	        if media_type == "video":
	            # Additional keys for video processing, passed along with the entry.
	            media_entry.update(
	                {"nframes": 128, "resized_width": 224, "resized_height": 224}
	            )
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    media_entry,
	                    {
	                        "type": "text",
	                        "text": fixed_prompt_text,
	                    },
	                ],
	            }
	        ]

	        print("DEBUG MESSAGES:", messages)

	        # 4. Prepare the text prompt for the Qwen2-VL model
	        text = processor.apply_chat_template(
	            messages,
	            tokenize=False,
	            add_generation_prompt=True,
	        )

	        # 5. Prepare the image/video data
	        image_inputs, video_inputs = process_vision_info(messages)
	        inputs = processor(
	            text=[text],
	            images=image_inputs,
	            videos=video_inputs,
	            padding=True,
	            return_tensors="pt",
	        ).to("cuda")

	        # 6. Streaming output
	        streamer = TextIteratorStreamer(
	            processor,
	            skip_prompt=True,
	            skip_special_tokens=True,
	        )
	        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

	        # 7. Launch generation in a separate thread so this generator can
	        #    consume the streamer while the model is still producing tokens.
	        thread = Thread(target=model.generate, kwargs=generation_kwargs)
	        thread.start()

	        # 8. Stream partial outputs back
	        buffer = ""
	        for new_text in streamer:
	            buffer += new_text
	            yield buffer

	        # The streamer is exhausted; wait for the worker so it is not left
	        # dangling after the request finishes.
	        thread.join()
	    finally:
	        # Remove the copy made by identify_and_save_blob. By this point the
	        # media has already been turned into model inputs (or we are
	        # unwinding), so the file is no longer needed.
	        if is_temp_copy:
	            try:
	                os.remove(media_path)
	            except OSError as cleanup_error:
	                print(f"Could not remove temporary file {media_path}: {cleanup_error}")



	# Stylesheet handed to gr.Blocks.
	# NOTE(review): no component below declares elem_id="output", so the #output
	# rule does not appear to match anything -- confirm whether output_text was
	# meant to carry that id.
	css = """
	#output {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""

	with gr.Blocks(css=css) as demo:
	    gr.Markdown(DESCRIPTION)

	    with gr.Tab(label="Image/Video Input"):
	        with gr.Row():
	            # Left column: the upload widget and the trigger button. There is
	            # no question box; the prompt is fixed inside qwen_inference.
	            with gr.Column():
	                input_media = gr.File(
	                    label="Upload Image or Video",
	                    type="filepath",
	                )
	                submit_btn = gr.Button(value="Submit")

	            # Right column: the streamed model output.
	            with gr.Column():
	                output_text = gr.Textbox(label="Output Text")

	        # The media path is the only input passed to qwen_inference.
	        submit_btn.click(
	            fn=qwen_inference,
	            inputs=[input_media],
	            outputs=[output_text],
	        )

	demo.launch(debug=True)