import os

# Force qwen_vl_utils to use the decord video reader before it is imported.
os.environ["FORCE_QWENVL_VIDEO_READER"] = "decord"

import sys

print(
    "Startup check: Using FORCE_QWENVL_VIDEO_READER=",
    os.environ.get("FORCE_QWENVL_VIDEO_READER"),
    file=sys.stderr,
)
sys.stderr.flush()

import io
import uuid
from threading import Thread

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from qwen_vl_utils import process_vision_info

# Model and Processor Loading (done once at startup)
MODEL_ID = "omni-research/Tarsier2-Recap-7b"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

DESCRIPTION = "Behavioral Video Analysis Demo"

# Mapping of image file extensions (e.g. ".png") to PIL format names.
image_extensions = Image.registered_extensions()
video_extensions = (
    "avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav",
    "gif", "webm", "m4v", "3gp",
)


def identify_and_save_blob(blob_path):
    """Identifies whether the blob is an image or a video and saves it accordingly."""
    try:
        with open(blob_path, "rb") as file:
            blob_content = file.read()

        # Try to identify the content as an image.
        try:
            Image.open(io.BytesIO(blob_content)).verify()  # Raises if not a valid image
            extension = ".png"  # Default to PNG for saving
            media_type = "image"
        except (IOError, SyntaxError):
            # Not a valid image, so assume it is a video.
            extension = ".mp4"  # Default to MP4 for saving
            media_type = "video"

        # Save under a unique filename.
        filename = f"temp_{uuid.uuid4()}_media{extension}"
        with open(filename, "wb") as f:
            f.write(blob_content)

        return filename, media_type
    except FileNotFoundError:
        raise ValueError(f"The file {blob_path} was not found.")
    except Exception as e:
        raise ValueError(f"An error occurred while processing the file: {e}")


@spaces.GPU
def qwen_inference(media_input):
    """Runs inference with a fixed, hard-coded prompt; the text_input
    parameter has been removed."""
    # 1. Identify whether media_input is an image or a video filepath.
    if isinstance(media_input, str):  # It is a filepath
        media_path = media_input
        if media_path.endswith(tuple(image_extensions.keys())):
            media_type = "image"
        elif media_path.endswith(video_extensions):
            media_type = "video"
        else:
            # Unrecognized extension: fall back to identify_and_save_blob.
            try:
                media_path, media_type = identify_and_save_blob(media_input)
                print(media_path, media_type)
            except Exception as e:
                print(e)
                raise ValueError("Unsupported media type. Please upload an image or video.")
    else:
        # Without this branch, media_path/media_type would be unbound below.
        raise ValueError("Expected a filepath string for the uploaded media.")

    print(media_path)
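    # (Added sketch, not part of the original demo.) A minimal sanity check,
    # assuming media_path always refers to a file on local disk; gr.File with
    # type="filepath" should provide such a path.
    if not os.path.isfile(media_path):
        raise ValueError(f"Resolved media path does not exist: {media_path}")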
    # 2. Hard-code the text prompt here.
    fixed_prompt_text = """
Use the following typology to describe the behaviors of the child in the video.

sr_no | indicator_1 | indicator_2 | indicator_3
1 | Behavioral Category | Holding Objects | Holding two random objects, often simultaneously
2 | Behavioral Category | Holding Objects | Persistent attachment to specific objects
3 | Behavioral Category | Eye Contact and Engagement | Lack of eye contact or minimal eye engagement
4 | Behavioral Category | Eye Contact and Engagement | Focus on objects rather than people during interaction
5 | Behavioral Category | Eye Contact and Engagement | Unresponsive to name being called or other verbal cues
6 | Behavioral Category | Eye Contact and Engagement | Limited back-and-forth gaze between people and objects
7 | Behavioral Category | Facial Expressions | Flat or unexpressive face
8 | Behavioral Category | Facial Expressions | Limited range of facial expressions
9 | Behavioral Category | Facial Expressions | Occasional tense or grimacing facial posture
10 | Behavioral Category | Social Interaction | Lack of shared enjoyment or visible emotional connection during interactions
11 | Behavioral Category | Social Interaction | Disinterest in other people, even when they are engaging
12 | Behavioral Category | Social Interaction | Inconsistent or no acknowledgment of social gestures like pointing
13 | Movement and Gestures | Repetitive Movements | Hand flapping
14 | Movement and Gestures | Repetitive Movements | Toe walking or bouncing on toes
15 | Movement and Gestures | Repetitive Movements | Rocking back and forth, sometimes aggressively
16 | Movement and Gestures | Repetitive Movements | Pacing or repetitive movements in a fixed area
17 | Movement and Gestures | Repetitive Movements | Head shaking side to side
18 | Movement and Gestures | Repetitive Movements | Spinning
19 | Movement and Gestures | Gestural Communication | Using another person’s hand to point, request, or manipulate objects
20 | Movement and Gestures | Gestural Communication | Nodding
21 | Interaction with Toys and Objects | Play Behavior | Lining up toys or objects systematically, often by color or type
22 | Interaction with Toys and Objects | Play Behavior | Stacking items like cans or blocks repeatedly
23 | Interaction with Toys and Objects | Play Behavior | Fixation on spinning objects or wheels
24 | Interaction with Toys and Objects | Play Behavior | Inspecting objects from unusual angles, such as sideways
25 | Interaction with Toys and Objects | Sensory Preferences | Chewing or mouthing objects
26 | Interaction with Toys and Objects | Sensory Preferences | Sensory-seeking behaviors like rubbing textures or spinning in circles without getting dizzy
27 | Interaction with Toys and Objects | Sensory Preferences | Sensitivity to sounds, often covering ears
28 | Interaction with Toys and Objects | Sensory Preferences | Visual inspection of objects up close or intensely
29 | Gender and Developmental Nuances | Gender-Based Masking | Females may mimic or "mask" typical behaviors more effectively, making symptoms less apparent
30 | Gender and Developmental Nuances | Gender-Based Masking | Girls may demonstrate learned emotional and social responses that obscure typical signs
31 | Gender and Developmental Nuances | Developmental Indicators | Delays or atypical development in social communication and interaction milestones
32 | Gender and Developmental Nuances | Developmental Indicators | Difficulty with back-and-forth conversation or social reciprocity

Your output should be a list of only the indicators that were observed in the
video. Do not include any indicators for which evidence is low or non-existent.
"""
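    # (Added sketch, not part of the original demo.) If the indicator table
    # grows, the prompt could be loaded from a file instead of being
    # hard-coded; "prompt.txt" below is a hypothetical filename:
    # with open("prompt.txt", encoding="utf-8") as pf:
    #     fixed_prompt_text = pf.read()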
    # 3. Construct the messages with the fixed text.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": media_type,
                    media_type: media_path,
                    # Set any additional keys for video processing:
                    **(
                        {"nframes": 128, "resized_width": 224, "resized_height": 224}
                        if media_type == "video"
                        else {}
                    ),
                },
                {"type": "text", "text": fixed_prompt_text},
            ],
        }
    ]
    print("DEBUG MESSAGES:", messages)

    # 4. Prepare the text prompt for the Qwen2-VL model.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # 5. Prepare the image/video data.
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # 6. Set up streaming output.
    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    # 7. Launch generation in a separate thread for streaming.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # 8. Stream partial outputs back.
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer


css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Image/Video Input"):
        with gr.Row():
            with gr.Column():
                input_media = gr.File(
                    label="Upload Image or Video", type="filepath"
                )
                # 1) The text_input box has been removed.
                # text_input = gr.Textbox(label="Question")  # removed
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

    # 2) qwen_inference is now called with just the media input.
    submit_btn.click(
        qwen_inference,
        [input_media],  # no text_input argument
        [output_text],
    )

demo.launch(debug=True)
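# Usage note (assumption, not stated in the original demo): @spaces.GPU is a
# Hugging Face Spaces (ZeroGPU) feature and should be a no-op elsewhere, so
# running this script locally needs a CUDA GPU with enough memory for the 7B
# model in float16 (roughly 14 GB for the weights alone):
#   python app.py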