# app.py: Behavioral Video Analysis Demo (Tarsier2-Recap on Qwen2-VL)
import os
os.environ["FORCE_QWENVL_VIDEO_READER"] = "decord"
import sys
print("Startup check: Using FORCE_QWENVL_VIDEO_READER=", os.environ.get("FORCE_QWENVL_VIDEO_READER"), file=sys.stderr)
sys.stderr.flush()
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
from threading import Thread
import uuid
import io
# Model and Processor Loading (Done once at startup)
MODEL_ID = "omni-research/Tarsier2-Recap-7b"
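# Tarsier2-Recap-7b is built on the Qwen2-VL architecture, which is why it loads
# through Qwen2VLForConditionalGeneration and the matching AutoProcessor.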
model = Qwen2VLForConditionalGeneration.from_pretrained(
MODEL_ID,
trust_remote_code=True,
torch_dtype=torch.float16
).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
DESCRIPTION = "Behavioral Video Analysis Demo"
image_extensions = Image.registered_extensions()  # maps ".ext" -> PIL format name
video_extensions = (".avi", ".mp4", ".mov", ".mkv", ".flv", ".wmv", ".mjpeg", ".gif", ".webm", ".m4v", ".3gp")
def identify_and_save_blob(blob_path):
"""Identifies if the blob is an image or video and saves it accordingly."""
try:
with open(blob_path, 'rb') as file:
blob_content = file.read()
# Try to identify if it's an image
try:
Image.open(io.BytesIO(blob_content)).verify() # Check if it's a valid image
extension = ".png" # Default to PNG for saving
media_type = "image"
except (IOError, SyntaxError):
# If it's not a valid image, assume it's a video
extension = ".mp4" # Default to MP4 for saving
media_type = "video"
# Create a unique filename
filename = f"temp_{uuid.uuid4()}_media{extension}"
with open(filename, "wb") as f:
f.write(blob_content)
return filename, media_type
except FileNotFoundError:
raise ValueError(f"The file {blob_path} was not found.")
except Exception as e:
raise ValueError(f"An error occurred while processing the file: {e}")
@spaces.GPU
def qwen_inference(media_input):
"""
We've removed the text_input parameter and switched to a
fixed prompt (hard-coded).
"""
# 1. Identify whether media_input is an image or video filepath
if isinstance(media_input, str): # If it's a filepath
media_path = media_input
        if media_path.endswith(tuple(image_extensions.keys())):
media_type = "image"
elif media_path.endswith(video_extensions):
media_type = "video"
else:
# If we don't recognize the file extension, try identify_and_save_blob
try:
media_path, media_type = identify_and_save_blob(media_input)
                print("Saved blob as:", media_path, "type:", media_type)
except Exception as e:
print(e)
raise ValueError("Unsupported media type. Please upload an image or video.")
    print("Resolved media path:", media_path)
# 2. Hard-code the text prompt here
fixed_prompt_text = """
Use the following typology to describe the behaviors of the child in the video
category | sub_category | indicator | sr_no
Behavioral Category | Holding Objects | Holding two random objects, often simultaneously | 1
Behavioral Category | Holding Objects | Persistent attachment to specific objects | 2
Behavioral Category | Eye Contact and Engagement | Lack of eye contact or minimal eye engagement | 3
Behavioral Category | Eye Contact and Engagement | Focus on objects rather than people during interaction | 4
Behavioral Category | Eye Contact and Engagement | Unresponsive to name being called or other verbal cues | 5
Behavioral Category | Eye Contact and Engagement | Limited back-and-forth gaze between people and objects | 6
Behavioral Category | Facial Expressions | Flat or unexpressive face | 7
Behavioral Category | Facial Expressions | Limited range of facial expressions | 8
Behavioral Category | Facial Expressions | Occasional tense or grimacing facial posture | 9
Behavioral Category | Social Interaction | Lack of shared enjoyment or visible emotional connection during interactions | 10
Behavioral Category | Social Interaction | Disinterest in other people, even when they are engaging | 11
Behavioral Category | Social Interaction | Inconsistent or no acknowledgment of social gestures like pointing | 12
Movement and Gestures | Repetitive Movements | Hand flapping | 13
Movement and Gestures | Repetitive Movements | Toe walking or bouncing on toes | 14
Movement and Gestures | Repetitive Movements | Rocking back and forth, sometimes aggressively | 15
Movement and Gestures | Repetitive Movements | Pacing or repetitive movements in a fixed area | 16
Movement and Gestures | Repetitive Movements | Head shaking side to side | 17
Movement and Gestures | Repetitive Movements | Spinning | 18
Movement and Gestures | Gestural Communication | Using another person’s hand to point, request, or manipulate objects | 19
Movement and Gestures | Gestural Communication | Nodding | 20
Interaction with Toys and Objects | Play Behavior | Lining up toys or objects systematically, often by color or type | 21
Interaction with Toys and Objects | Play Behavior | Stacking items like cans or blocks repeatedly | 22
Interaction with Toys and Objects | Play Behavior | Fixation on spinning objects or wheels | 23
Interaction with Toys and Objects | Play Behavior | Inspecting objects from unusual angles, such as sideways | 24
Interaction with Toys and Objects | Sensory Preferences | Chewing or mouthing objects | 25
Interaction with Toys and Objects | Sensory Preferences | Sensory-seeking behaviors like rubbing textures or spinning in circles without getting dizzy | 26
Interaction with Toys and Objects | Sensory Preferences | Sensitivity to sounds, often covering ears | 27
Interaction with Toys and Objects | Sensory Preferences | Visual inspection of objects up close or intensely | 28
Gender and Developmental Nuances | Gender-Based Masking | Females may mimic or "mask" typical behaviors more effectively, making symptoms less apparent | 29
Gender and Developmental Nuances | Gender-Based Masking | Girls may demonstrate learned emotional and social responses that obscure typical signs | 30
Gender and Developmental Nuances | Developmental Indicators | Delays or atypical development in social communication and interaction milestones | 31
Gender and Developmental Nuances | Developmental Indicators | Difficulty with back-and-forth conversation or social reciprocity | 32
Your output should be a list of only the indicators that were observed in the video. Do not include any indicators for which evidence is low or non-existent.
"""
# 3. Construct the messages with your fixed text
messages = [
{
"role": "user",
"content": [
{
"type": media_type,
media_type: media_path,
                    # Video-only keys consumed by process_vision_info:
                    # sample up to 128 frames and resize each to 224x224.
                    **({"nframes": 128, "resized_width": 224, "resized_height": 224} if media_type == "video" else {}),
},
{
"type": "text",
"text": fixed_prompt_text
},
],
}
]
print("DEBUG MESSAGES:", messages)
# 4. Prepare the text prompt for the Qwen2-VL model
text = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
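    # `text` is now the chat-formatted prompt containing vision placeholder
    # tokens, which the processor later expands to match the sampled frames.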
# 5. Prepare the image/video data
image_inputs, video_inputs = process_vision_info(messages)
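    # image_inputs / video_inputs hold the decoded media extracted from
    # `messages` (video frames read via decord, per the env var set at startup).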
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
).to("cuda")
# 6. Streaming output
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
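    # `inputs` is a BatchFeature (a mapping), so dict(inputs, ...) below merges
    # the model tensors with the generation arguments into one kwargs dict.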
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
# 7. Launch generation in separate thread for streaming
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
# 8. Stream partial outputs back
buffer = ""
for new_text in streamer:
buffer += new_text
yield buffer
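# Illustrative local smoke test (hypothetical filename, not part of the app;
# off a Space the @spaces.GPU decorator should act as a pass-through):
#   for partial in qwen_inference("sample_clip.mp4"):
#       print(partial)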
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
gr.Markdown(DESCRIPTION)
with gr.Tab(label="Image/Video Input"):
with gr.Row():
with gr.Column():
input_media = gr.File(
label="Upload Image or Video",
type="filepath"
)
submit_btn = gr.Button(value="Submit")
with gr.Column():
                output_text = gr.Textbox(label="Output Text", elem_id="output")  # matched by the #output CSS rule
        # qwen_inference takes only the uploaded media; the prompt is fixed in code.
        submit_btn.click(
            qwen_inference,
            [input_media],
            [output_text]
        )
demo.launch(debug=True)