omer-bhutta committed
Commit c403d08 · verified · 1 Parent(s): dc375d3

Update app.py

Files changed (1): app.py (+79 -11)
app.py CHANGED
@@ -64,7 +64,13 @@ def identify_and_save_blob(blob_path):
 
 
 @spaces.GPU
-def qwen_inference(media_input, text_input=None):
+def qwen_inference(media_input):
+    """
+    We've removed the text_input parameter and switched to a
+    fixed prompt (hard-coded).
+    """
+
+    # 1. Identify whether media_input is an image or video filepath
     if isinstance(media_input, str):  # If it's a filepath
         media_path = media_input
         if media_path.endswith(tuple([i for i, f in image_extensions.items()])):
@@ -72,18 +78,58 @@ def qwen_inference(media_input, text_input=None):
         elif media_path.endswith(video_extensions):
             media_type = "video"
         else:
+            # If we don't recognize the file extension, try identify_and_save_blob
             try:
                 media_path, media_type = identify_and_save_blob(media_input)
                 print(media_path, media_type)
             except Exception as e:
                 print(e)
-                raise ValueError(
-                    "Unsupported media type. Please upload an image or video."
-                )
-
+                raise ValueError("Unsupported media type. Please upload an image or video.")
 
     print(media_path)
 
+    # 2. Hard-code the text prompt here
+    fixed_prompt_text = """
+    Use the following typology to describe the behaviors of the child in the video
+
+    indicator_1 indicator_2 indicator_3 sr_no
+    Behavioral Category Holding Objects Holding two random objects, often simultaneously 1
+    Behavioral Category Holding Objects Persistent attachment to specific objects 2
+    Behavioral Category Eye Contact and Engagement Lack of eye contact or minimal eye engagement 3
+    Behavioral Category Eye Contact and Engagement Focus on objects rather than people during interaction 4
+    Behavioral Category Eye Contact and Engagement Unresponsive to name being called or other verbal cues 5
+    Behavioral Category Eye Contact and Engagement Limited back-and-forth gaze between people and objects 6
+    Behavioral Category Facial Expressions Flat or unexpressive face 7
+    Behavioral Category Facial Expressions Limited range of facial expressions 8
+    Behavioral Category Facial Expressions Occasional tense or grimacing facial posture 9
+    Behavioral Category Social Interaction Lack of shared enjoyment or visible emotional connection during interactions 10
+    Behavioral Category Social Interaction Disinterest in other people, even when they are engaging 11
+    Behavioral Category Social Interaction Inconsistent or no acknowledgment of social gestures like pointing 12
+    Movement and Gestures Repetitive Movements Hand flapping 13
+    Movement and Gestures Repetitive Movements Toe walking or bouncing on toes 14
+    Movement and Gestures Repetitive Movements Rocking back and forth, sometimes aggressively 15
+    Movement and Gestures Repetitive Movements Pacing or repetitive movements in a fixed area 16
+    Movement and Gestures Repetitive Movements Head shaking side to side 17
+    Movement and Gestures Repetitive Movements Spinning 18
+    Movement and Gestures Gestural Communication Using another person’s hand to point, request, or manipulate objects 19
+    Movement and Gestures Gestural Communication Nodding 20
+    Interaction with Toys and Objects Play Behavior Lining up toys or objects systematically, often by color or type 21
+    Interaction with Toys and Objects Play Behavior Stacking items like cans or blocks repeatedly 22
+    Interaction with Toys and Objects Play Behavior Fixation on spinning objects or wheels 23
+    Interaction with Toys and Objects Play Behavior Inspecting objects from unusual angles, such as sideways 24
+    Interaction with Toys and Objects Sensory Preferences Chewing or mouthing objects 25
+    Interaction with Toys and Objects Sensory Preferences Sensory-seeking behaviors like rubbing textures or spinning in circles without getting dizzy 26
+    Interaction with Toys and Objects Sensory Preferences Sensitivity to sounds, often covering ears 27
+    Interaction with Toys and Objects Sensory Preferences Visual inspection of objects up close or intensely 28
+    Gender and Developmental Nuances Gender-Based Masking Females may mimic or "mask" typical behaviors more effectively, making symptoms less apparent 29
+    Gender and Developmental Nuances Gender-Based Masking Girls may demonstrate learned emotional and social responses that obscure typical signs 30
+    Gender and Developmental Nuances Developmental Indicators Delays or atypical development in social communication and interaction milestones 31
+    Gender and Developmental Nuances Developmental Indicators Difficulty with back-and-forth conversation or social reciprocity 32
+
+    Your output should indicate for each indicator if the behavior specified in that row is visible in the video or not
+    """
+
+    # 3. Construct the messages with your fixed text
     messages = [
         {
             "role": "user",
@@ -91,18 +137,27 @@ def qwen_inference(media_input, text_input=None):
                 {
                     "type": media_type,
                     media_type: media_path,
+                    # Set any additional keys for video processing:
                     **({"nframes": 16, "resized_width": 224, "resized_height": 224} if media_type == "video" else {}),
                 },
-                {"type": "text", "text": text_input},
+                {
+                    "type": "text",
+                    "text": fixed_prompt_text
+                },
             ],
         }
     ]
 
     print("DEBUG MESSAGES:", messages)
 
+    # 4. Prepare the text prompt for the Qwen2-VL model
     text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
     )
+
+    # 5. Prepare the image/video data
     image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
         text=[text],
@@ -112,19 +167,26 @@ def qwen_inference(media_input, text_input=None):
         return_tensors="pt",
     ).to("cuda")
 
+    # 6. Streaming output
     streamer = TextIteratorStreamer(
-        processor, skip_prompt=True, **{"skip_special_tokens": True}
+        processor,
+        skip_prompt=True,
+        **{"skip_special_tokens": True}
     )
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
 
+    # 7. Launch generation in separate thread for streaming
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
+    # 8. Stream partial outputs back
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         yield buffer
 
+
+
 css = """
 #output {
     height: 500px;
@@ -140,15 +202,21 @@ with gr.Blocks(css=css) as demo:
     with gr.Row():
         with gr.Column():
             input_media = gr.File(
-                label="Upload Image or Video", type="filepath"
+                label="Upload Image or Video",
+                type="filepath"
             )
-            text_input = gr.Textbox(label="Question")
+            # 1) Remove the text_input box
+            # text_input = gr.Textbox(label="Question")  # removed
+
             submit_btn = gr.Button(value="Submit")
         with gr.Column():
             output_text = gr.Textbox(label="Output Text")
 
+    # 2) qwen_inference is now called with just the media input
     submit_btn.click(
-        qwen_inference, [input_media, text_input], [output_text]
+        qwen_inference,
+        [input_media],  # no text_input argument
+        [output_text]
     )
 
 demo.launch(debug=True)
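
For a quick sanity check of this change outside the Gradio UI, the updated generator can be driven directly. This is a minimal sketch, assuming app.py's module-level objects (model, processor, and the helper functions) are already loaded; "sample.mp4" is a hypothetical local file used only for illustration:

    # Smoke test: consume the streaming generator directly.
    # Assumes app.py's globals (model, processor, image_extensions,
    # video_extensions, identify_and_save_blob) are in scope;
    # "sample.mp4" is a hypothetical file, not part of this repo.
    final_text = ""
    for partial in qwen_inference("sample.mp4"):
        final_text = partial  # each yield is the cumulative buffer so far
    print(final_text)         # full answer once the stream is exhausted

Because qwen_inference is a generator, Gradio streams each yielded buffer into output_text as it arrives; the loop above relies on the same cumulative-buffer behavior, keeping only the last yield as the final answer.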