omer-bhutta committed
Commit c403d08 · verified · 1 Parent(s): dc375d3

Update app.py

Files changed (1): app.py (+79 -11)
app.py CHANGED
@@ -64,7 +64,13 @@ def identify_and_save_blob(blob_path):
 
 
 @spaces.GPU
-def qwen_inference(media_input, text_input=None):
+def qwen_inference(media_input):
+    """
+    We've removed the text_input parameter and switched to a
+    fixed prompt (hard-coded).
+    """
+
+    # 1. Identify whether media_input is an image or video filepath
     if isinstance(media_input, str):  # If it's a filepath
         media_path = media_input
         if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
@@ -72,18 +78,58 @@ def qwen_inference(media_input, text_input=None):
         elif media_path.endswith(video_extensions):
             media_type = "video"
         else:
+            # If we don't recognize the file extension, try identify_and_save_blob
             try:
                 media_path, media_type = identify_and_save_blob(media_input)
                 print(media_path, media_type)
             except Exception as e:
                 print(e)
-                raise ValueError(
-                    "Unsupported media type. Please upload an image or video."
-                )
-
+                raise ValueError("Unsupported media type. Please upload an image or video.")
 
     print(media_path)
 
+    # 2. Hard-code the text prompt here
+    fixed_prompt_text = """
+    Use the following typology to describe the behaviors of the child in the video
+
+    indicator_1 indicator_2 indicator_3 sr_no
+    Behavioral Category Holding Objects Holding two random objects, often simultaneously 1
+    Behavioral Category Holding Objects Persistent attachment to specific objects 2
+    Behavioral Category Eye Contact and Engagement Lack of eye contact or minimal eye engagement 3
+    Behavioral Category Eye Contact and Engagement Focus on objects rather than people during interaction 4
+    Behavioral Category Eye Contact and Engagement Unresponsive to name being called or other verbal cues 5
+    Behavioral Category Eye Contact and Engagement Limited back-and-forth gaze between people and objects 6
+    Behavioral Category Facial Expressions Flat or unexpressive face 7
+    Behavioral Category Facial Expressions Limited range of facial expressions 8
+    Behavioral Category Facial Expressions Occasional tense or grimacing facial posture 9
+    Behavioral Category Social Interaction Lack of shared enjoyment or visible emotional connection during interactions 10
+    Behavioral Category Social Interaction Disinterest in other people, even when they are engaging 11
+    Behavioral Category Social Interaction Inconsistent or no acknowledgment of social gestures like pointing 12
+    Movement and Gestures Repetitive Movements Hand flapping 13
+    Movement and Gestures Repetitive Movements Toe walking or bouncing on toes 14
+    Movement and Gestures Repetitive Movements Rocking back and forth, sometimes aggressively 15
+    Movement and Gestures Repetitive Movements Pacing or repetitive movements in a fixed area 16
+    Movement and Gestures Repetitive Movements Head shaking side to side 17
+    Movement and Gestures Repetitive Movements Spinning 18
+    Movement and Gestures Gestural Communication Using another person’s hand to point, request, or manipulate objects 19
+    Movement and Gestures Gestural Communication Nodding 20
+    Interaction with Toys and Objects Play Behavior Lining up toys or objects systematically, often by color or type 21
+    Interaction with Toys and Objects Play Behavior Stacking items like cans or blocks repeatedly 22
+    Interaction with Toys and Objects Play Behavior Fixation on spinning objects or wheels 23
+    Interaction with Toys and Objects Play Behavior Inspecting objects from unusual angles, such as sideways 24
+    Interaction with Toys and Objects Sensory Preferences Chewing or mouthing objects 25
+    Interaction with Toys and Objects Sensory Preferences Sensory-seeking behaviors like rubbing textures or spinning in circles without getting dizzy 26
+    Interaction with Toys and Objects Sensory Preferences Sensitivity to sounds, often covering ears 27
+    Interaction with Toys and Objects Sensory Preferences Visual inspection of objects up close or intensely 28
+    Gender and Developmental Nuances Gender-Based Masking Females may mimic or "mask" typical behaviors more effectively, making symptoms less apparent 29
+    Gender and Developmental Nuances Gender-Based Masking Girls may demonstrate learned emotional and social responses that obscure typical signs 30
+    Gender and Developmental Nuances Developmental Indicators Delays or atypical development in social communication and interaction milestones 31
+    Gender and Developmental Nuances Developmental Indicators Difficulty with back-and-forth conversation or social reciprocity 32
+
+    Your output should indicate for each indicator if the behavior specified in that row is visible in the video or not
+    """
+
+    # 3. Construct the messages with your fixed text
     messages = [
         {
             "role": "user",
@@ -91,18 +137,27 @@ def qwen_inference(media_input, text_input=None):
                 {
                     "type": media_type,
                     media_type: media_path,
+                    # Set any additional keys for video processing:
                     **({"nframes": 16, "resized_width": 224, "resized_height": 224} if media_type == "video" else {}),
                 },
-                {"type": "text", "text": text_input},
+                {
+                    "type": "text",
+                    "text": fixed_prompt_text
+                },
             ],
         }
     ]
 
     print("DEBUG MESSAGES:", messages)
 
+    # 4. Prepare the text prompt for the Qwen2-VL model
     text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
+
+    # 5. Prepare the image/video data
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
         text=[text],
@@ -112,19 +167,26 @@ def qwen_inference(media_input, text_input=None):
         return_tensors="pt",
     ).to("cuda")
 
+    # 6. Streaming output
     streamer = TextIteratorStreamer(
-        processor, skip_prompt=True, **{"skip_special_tokens": True}
+        processor,
+        skip_prompt=True,
+        **{"skip_special_tokens": True}
     )
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
 
+    # 7. Launch generation in separate thread for streaming
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
+    # 8. Stream partial outputs back
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         yield buffer
 
+
+
 css = """
 #output {
     height: 500px;
@@ -140,15 +202,21 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column():
             input_media = gr.File(
-                label="Upload Image or Video", type="filepath"
+                label="Upload Image or Video",
+                type="filepath"
             )
-            text_input = gr.Textbox(label="Question")
+            # 1) Remove the text_input box
+            # text_input = gr.Textbox(label="Question")  # removed
+
             submit_btn = gr.Button(value="Submit")
         with gr.Column():
             output_text = gr.Textbox(label="Output Text")
 
+    # 2) qwen_inference is now called with just the media input
     submit_btn.click(
-        qwen_inference, [input_media, text_input], [output_text]
+        qwen_inference,
+        [input_media],  # no text_input argument
+        [output_text]
     )
 
 demo.launch(debug=True)
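
For a quick sanity check of this change outside the Gradio UI, the updated generator can be driven directly. This is a minimal sketch, assuming app.py's module-level objects (model, processor, and the helper functions) are already loaded; "sample.mp4" is a hypothetical local file used only for illustration:

    # Smoke test: consume the streaming generator directly.
    # Assumes app.py's globals (model, processor, image_extensions,
    # video_extensions, identify_and_save_blob) are in scope;
    # "sample.mp4" is a hypothetical file, not part of this repo.
    final_text = ""
    for partial in qwen_inference("sample.mp4"):
        final_text = partial  # each yield is the cumulative buffer so far
    print(final_text)         # full answer once the stream is exhausted

Because qwen_inference is a generator, Gradio streams each yielded buffer into output_text as it arrives; the loop above relies on the same cumulative-buffer behavior, keeping only the last yield as the final answer.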