Qwen2-VL-2B

Paused

File size: 9,349 Bytes

8e98e4c
 
 
a27e746
 
 
 
8e98e4c
8f558df
21fcfe6
868ab7f
0e31dfe
21fcfe6
8f558df
21fcfe6
a533ef3
425e364
868ab7f
 
 
a533ef3
868ab7f
95a4394
868ab7f
 
 
 
 
 
21fcfe6
885d899
21fcfe6
868ab7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21fcfe6
 
 
c403d08
 
 
 
 
 
 
868ab7f
 
 
 
 
 
 
c403d08
868ab7f
 
 
 
 
c403d08
868ab7f
 
 
c403d08
 
5dcca16
620ccea
c403d08
620ccea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c403d08
620ccea
c403d08
 
 
dcf6d05
868ab7f
dcf6d05
 
 
868ab7f
 
c403d08
de5d173
dcf6d05
c403d08
 
 
 
dcf6d05
 
 
868ab7f
3d209d0
 
c403d08
dcf6d05
c403d08
 
 
dcf6d05
c403d08
 
dcf6d05
 
 
 
 
 
 
868ab7f
 
c403d08
868ab7f
c403d08
 
 
dcf6d05
868ab7f
 
c403d08
868ab7f
 
 
c403d08
868ab7f
 
 
 
8f558df
c403d08
 
8f558df
 
 
 
 
 
 
 
 
 
868ab7f
 
8f558df
 
868ab7f
c403d08
 
868ab7f
c403d08
 
 
8f558df
 
 
21fcfe6
c403d08
868ab7f
c403d08
 
 
868ab7f
8f558df
755339c

import os
os.environ["FORCE_QWENVL_VIDEO_READER"] = "decord"

import sys
print("Startup check: Using FORCE_QWENVL_VIDEO_READER=", os.environ.get("FORCE_QWENVL_VIDEO_READER"), file=sys.stderr)
sys.stderr.flush()


import gradio as gr
import spaces
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from qwen_vl_utils import process_vision_info
import torch
from PIL import Image
import subprocess
import numpy as np
import os
from threading import Thread
import uuid
import io

# Model and Processor Loading (Done once at startup)
MODEL_ID = "omni-research/Tarsier2-Recap-7b"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

DESCRIPTION = "Behavioral Video Analysis Demo"

image_extensions = Image.registered_extensions()
video_extensions = ("avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg", "wav", "gif", "webm", "m4v", "3gp")


def identify_and_save_blob(blob_path):
    """Identifies if the blob is an image or video and saves it accordingly."""
    try:
        with open(blob_path, 'rb') as file:
            blob_content = file.read()
            
            # Try to identify if it's an image
            try:
                Image.open(io.BytesIO(blob_content)).verify()  # Check if it's a valid image
                extension = ".png"  # Default to PNG for saving
                media_type = "image"
            except (IOError, SyntaxError):
                # If it's not a valid image, assume it's a video
                extension = ".mp4"  # Default to MP4 for saving
                media_type = "video"
            
            # Create a unique filename
            filename = f"temp_{uuid.uuid4()}_media{extension}"
            with open(filename, "wb") as f:
                f.write(blob_content)
                
            return filename, media_type
            
    except FileNotFoundError:
        raise ValueError(f"The file {blob_path} was not found.")
    except Exception as e:
        raise ValueError(f"An error occurred while processing the file: {e}")


@spaces.GPU
def qwen_inference(media_input):
    """
    We've removed the text_input parameter and switched to a 
    fixed prompt (hard-coded). 
    """

    # 1. Identify whether media_input is an image or video filepath
    if isinstance(media_input, str):  # If it's a filepath
        media_path = media_input
        if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
            media_type = "image"
        elif media_path.endswith(video_extensions): 
            media_type = "video"
        else:
            # If we don't recognize the file extension, try identify_and_save_blob
            try:
                media_path, media_type = identify_and_save_blob(media_input)
                print(media_path, media_type)
            except Exception as e:
                print(e)
                raise ValueError("Unsupported media type. Please upload an image or video.")

    print(media_path)

    # 2. Hard-code the text prompt here
    fixed_prompt_text = """

    Use the following typology to describe the behaviors of the child in the video
    
    indicator_1	indicator_2	indicator_3	sr_no
    Behavioral Category	Holding Objects	Holding two random objects, often simultaneously	1
    Behavioral Category	Holding Objects	Persistent attachment to specific objects	2
    Behavioral Category	Eye Contact and Engagement	Lack of eye contact or minimal eye engagement	3
    Behavioral Category	Eye Contact and Engagement	Focus on objects rather than people during interaction	4
    Behavioral Category	Eye Contact and Engagement	Unresponsive to name being called or other verbal cues	5
    Behavioral Category	Eye Contact and Engagement	Limited back-and-forth gaze between people and objects	6
    Behavioral Category	Facial Expressions	Flat or unexpressive face	7
    Behavioral Category	Facial Expressions	Limited range of facial expressions	8
    Behavioral Category	Facial Expressions	Occasional tense or grimacing facial posture	9
    Behavioral Category	Social Interaction	Lack of shared enjoyment or visible emotional connection during interactions	10
    Behavioral Category	Social Interaction	Disinterest in other people, even when they are engaging	11
    Behavioral Category	Social Interaction	Inconsistent or no acknowledgment of social gestures like pointing	12
    Movement and Gestures	Repetitive Movements	Hand flapping	13
    Movement and Gestures	Repetitive Movements	Toe walking or bouncing on toes	14
    Movement and Gestures	Repetitive Movements	Rocking back and forth, sometimes aggressively	15
    Movement and Gestures	Repetitive Movements	Pacing or repetitive movements in a fixed area	16
    Movement and Gestures	Repetitive Movements	Head shaking side to side	17
    Movement and Gestures	Repetitive Movements	Spinning	18
    Movement and Gestures	Gestural Communication	Using another person’s hand to point, request, or manipulate objects	19
    Movement and Gestures	Gestural Communication	Nodding	20
    Interaction with Toys and Objects	Play Behavior	Lining up toys or objects systematically, often by color or type	21
    Interaction with Toys and Objects	Play Behavior	Stacking items like cans or blocks repeatedly	22
    Interaction with Toys and Objects	Play Behavior	Fixation on spinning objects or wheels	23
    Interaction with Toys and Objects	Play Behavior	Inspecting objects from unusual angles, such as sideways	24
    Interaction with Toys and Objects	Sensory Preferences	Chewing or mouthing objects	25
    Interaction with Toys and Objects	Sensory Preferences	Sensory-seeking behaviors like rubbing textures or spinning in circles without getting dizzy	26
    Interaction with Toys and Objects	Sensory Preferences	Sensitivity to sounds, often covering ears	27
    Interaction with Toys and Objects	Sensory Preferences	Visual inspection of objects up close or intensely	28
    Gender and Developmental Nuances	Gender-Based Masking	Females may mimic or "mask" typical behaviors more effectively, making symptoms less apparent	29
    Gender and Developmental Nuances	Gender-Based Masking	Girls may demonstrate learned emotional and social responses that obscure typical signs	30
    Gender and Developmental Nuances	Developmental Indicators	Delays or atypical development in social communication and interaction milestones	31
    Gender and Developmental Nuances	Developmental Indicators	Difficulty with back-and-forth conversation or social reciprocity	32
    
    Your output should be a list of only the indicators that were observed in the video. Do not include any indicators for which evidence is low or non-existent
    """

    # 3. Construct the messages with your fixed text
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": media_type,
                    media_type: media_path,
                    # Set any additional keys for video processing:
                    **({"nframes": 128, "resized_width": 224, "resized_height": 224} if media_type == "video" else {}),
                },
                {
                    "type": "text",
                    "text": fixed_prompt_text
                },
            ],
        }
    ]

    print("DEBUG MESSAGES:", messages)
    
    # 4. Prepare the text prompt for the Qwen2-VL model
    text = processor.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    # 5. Prepare the image/video data
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    # 6. Streaming output
    streamer = TextIteratorStreamer(
        processor, 
        skip_prompt=True, 
        **{"skip_special_tokens": True}
    )
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)

    # 7. Launch generation in separate thread for streaming
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # 8. Stream partial outputs back
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer



css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Tab(label="Image/Video Input"):
        with gr.Row():
            with gr.Column():
                input_media = gr.File(
                    label="Upload Image or Video", 
                    type="filepath" 
                )
                # 1) Remove the text_input box
                # text_input = gr.Textbox(label="Question")  # removed

                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # 2) qwen_inference is now called with just the media input
        submit_btn.click(
            qwen_inference, 
            [input_media],  # no text_input argument
            [output_text]
        )

demo.launch(debug=True)