import os
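# Set this before qwen_vl_utils is imported below, so the library sees it
# when it chooses a video reader backend (decord instead of torchvision).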
os.environ["FORCE_QWENVL_VIDEO_READER"] = "decord"
import sys
print("Startup check: Using FORCE_QWENVL_VIDEO_READER=", os.environ.get("FORCE_QWENVL_VIDEO_READER"), file=sys.stderr)
sys.stderr.flush()
import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
import numpy as np
from threading import Thread
import uuid
import io
# Model and Processor Loading (Done once at startup)
MODEL_ID = "omni-research/Tarsier2-Recap-7b"
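# float16 halves GPU memory versus float32; .eval() disables dropout for inference.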
model = Qwen2VLForConditionalGeneration.from_pretrained(
MODEL_ID,
trust_remote_code=True,
torch_dtype=torch.float16
).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
DESCRIPTION = "Behavioral Video Analysis Demo"
image_extensions = Image.registered_extensions()
video_extensions = (".avi", ".mp4", ".mov", ".mkv", ".flv", ".wmv", ".mjpeg", ".gif", ".webm", ".m4v", ".3gp")
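# image_extensions maps suffixes like ".png" to PIL format names; both lists
# are matched against the uploaded file's suffix in qwen_inference below.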
def identify_and_save_blob(blob_path):
"""Identifies if the blob is an image or video and saves it accordingly."""
try:
with open(blob_path, 'rb') as file:
blob_content = file.read()
# Try to identify if it's an image
try:
Image.open(io.BytesIO(blob_content)).verify() # Check if it's a valid image
extension = ".png" # Default to PNG for saving
media_type = "image"
except (IOError, SyntaxError):
# If it's not a valid image, assume it's a video
extension = ".mp4" # Default to MP4 for saving
media_type = "video"
# Create a unique filename
filename = f"temp_{uuid.uuid4()}_media{extension}"
with open(filename, "wb") as f:
f.write(blob_content)
return filename, media_type
except FileNotFoundError:
raise ValueError(f"The file {blob_path} was not found.")
except Exception as e:
raise ValueError(f"An error occurred while processing the file: {e}")
@spaces.GPU
def qwen_inference(media_input):
"""
We've removed the text_input parameter and switched to a
fixed prompt (hard-coded).
"""
# 1. Identify whether media_input is an image or video filepath
    if isinstance(media_input, str):  # If it's a filepath
        media_path = media_input
        if media_path.lower().endswith(tuple(image_extensions)):  # keys are suffixes like ".png"
            media_type = "image"
        elif media_path.lower().endswith(video_extensions):
            media_type = "video"
        else:
            # Unrecognized extension: sniff the bytes and save to a typed temp file
            try:
                media_path, media_type = identify_and_save_blob(media_input)
                print(media_path, media_type)
            except Exception as e:
                print(e)
                raise ValueError("Unsupported media type. Please upload an image or video.")
    else:
        raise ValueError("Expected a filepath string for the uploaded media.")
print(media_path)
# 2. Hard-code the text prompt here
    fixed_prompt_text = """
Use the following typology to describe the behaviors of the child in the video.

| category | subcategory | indicator | sr_no |
| --- | --- | --- | --- |
| Behavioral Category | Holding Objects | Holding two random objects, often simultaneously | 1 |
| Behavioral Category | Holding Objects | Persistent attachment to specific objects | 2 |
| Behavioral Category | Eye Contact and Engagement | Lack of eye contact or minimal eye engagement | 3 |
| Behavioral Category | Eye Contact and Engagement | Focus on objects rather than people during interaction | 4 |
| Behavioral Category | Eye Contact and Engagement | Unresponsive to name being called or other verbal cues | 5 |
| Behavioral Category | Eye Contact and Engagement | Limited back-and-forth gaze between people and objects | 6 |
| Behavioral Category | Facial Expressions | Flat or unexpressive face | 7 |
| Behavioral Category | Facial Expressions | Limited range of facial expressions | 8 |
| Behavioral Category | Facial Expressions | Occasional tense or grimacing facial posture | 9 |
| Behavioral Category | Social Interaction | Lack of shared enjoyment or visible emotional connection during interactions | 10 |
| Behavioral Category | Social Interaction | Disinterest in other people, even when they are engaging | 11 |
| Behavioral Category | Social Interaction | Inconsistent or no acknowledgment of social gestures like pointing | 12 |
| Movement and Gestures | Repetitive Movements | Hand flapping | 13 |
| Movement and Gestures | Repetitive Movements | Toe walking or bouncing on toes | 14 |
| Movement and Gestures | Repetitive Movements | Rocking back and forth, sometimes aggressively | 15 |
| Movement and Gestures | Repetitive Movements | Pacing or repetitive movements in a fixed area | 16 |
| Movement and Gestures | Repetitive Movements | Head shaking side to side | 17 |
| Movement and Gestures | Repetitive Movements | Spinning | 18 |
| Movement and Gestures | Gestural Communication | Using another person’s hand to point, request, or manipulate objects | 19 |
| Movement and Gestures | Gestural Communication | Nodding | 20 |
| Interaction with Toys and Objects | Play Behavior | Lining up toys or objects systematically, often by color or type | 21 |
| Interaction with Toys and Objects | Play Behavior | Stacking items like cans or blocks repeatedly | 22 |
| Interaction with Toys and Objects | Play Behavior | Fixation on spinning objects or wheels | 23 |
| Interaction with Toys and Objects | Play Behavior | Inspecting objects from unusual angles, such as sideways | 24 |
| Interaction with Toys and Objects | Sensory Preferences | Chewing or mouthing objects | 25 |
| Interaction with Toys and Objects | Sensory Preferences | Sensory-seeking behaviors like rubbing textures or spinning in circles without getting dizzy | 26 |
| Interaction with Toys and Objects | Sensory Preferences | Sensitivity to sounds, often covering ears | 27 |
| Interaction with Toys and Objects | Sensory Preferences | Visual inspection of objects up close or intensely | 28 |
| Gender and Developmental Nuances | Gender-Based Masking | Females may mimic or "mask" typical behaviors more effectively, making symptoms less apparent | 29 |
| Gender and Developmental Nuances | Gender-Based Masking | Girls may demonstrate learned emotional and social responses that obscure typical signs | 30 |
| Gender and Developmental Nuances | Developmental Indicators | Delays or atypical development in social communication and interaction milestones | 31 |
| Gender and Developmental Nuances | Developmental Indicators | Difficulty with back-and-forth conversation or social reciprocity | 32 |

Your output should be a list of only the indicators that were observed in the video. Do not include any indicators for which evidence is low or non-existent.
    """
# 3. Construct the messages with your fixed text
messages = [
{
"role": "user",
"content": [
{
"type": media_type,
media_type: media_path,
# Set any additional keys for video processing:
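                    # nframes caps how many frames qwen_vl_utils samples from the clip;
                    # resized_width/height bound per-frame resolution to keep VRAM in check.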
**({"nframes": 128, "resized_width": 224, "resized_height": 224} if media_type == "video" else {}),
},
{
"type": "text",
"text": fixed_prompt_text
},
],
}
]
print("DEBUG MESSAGES:", messages)
# 4. Prepare the text prompt for the Qwen2-VL model
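    #    apply_chat_template wraps the message in the model's chat markup and,
    #    with add_generation_prompt=True, appends the assistant-turn header so
    #    generation starts at the reply.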
text = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
# 5. Prepare the image/video data
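    #    process_vision_info reads the file and returns the decoded images /
    #    sampled video frames that the processor turns into pixel values.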
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
).to("cuda")
# 6. Streaming output
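    #    The streamer decodes tokens as they are produced; skip_prompt drops the
    #    echoed input so only newly generated text is yielded.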
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
# 7. Launch generation in separate thread for streaming
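    #    model.generate blocks until generation finishes, so it runs in a worker
    #    thread while this generator consumes the streamer.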
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
# 8. Stream partial outputs back
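    #    Yielding the cumulative buffer makes Gradio re-render the textbox with
    #    the full text so far on every update.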
buffer = ""
for new_text in streamer:
buffer += new_text
yield buffer
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
gr.Markdown(DESCRIPTION)
with gr.Tab(label="Image/Video Input"):
with gr.Row():
with gr.Column():
input_media = gr.File(
label="Upload Image or Video",
type="filepath"
)
# 1) Remove the text_input box
# text_input = gr.Textbox(label="Question") # removed
submit_btn = gr.Button(value="Submit")
with gr.Column():
                output_text = gr.Textbox(label="Output Text", elem_id="output")
# 2) qwen_inference is now called with just the media input
submit_btn.click(
qwen_inference,
[input_media], # no text_input argument
[output_text]
)
demo.launch(debug=True) |