prithivMLmods committed
Commit 3331201 · verified · 1 Parent(s): ebca0ae

Update app.py

Files changed (1)
  1. app.py +13 -3
app.py CHANGED
@@ -8,6 +8,7 @@ import torch
 from PIL import Image
 import uuid
 import io
+import os
 
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -93,10 +94,19 @@ def model_inference(input_dict, history):
 
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Prepare inputs for the processor
+    image_inputs = [load_image(path) for path, media_type in zip(media_paths, media_types) if media_type == "image"]
+    video_inputs = [path for path, media_type in zip(media_paths, media_types) if media_type == "video"]
+
+    # Ensure video_inputs is not empty
+    if not video_inputs:
+        video_inputs = None
+
     inputs = processor(
         text=[prompt],
-        images=[load_image(path) for path, media_type in zip(media_paths, media_types) if media_type == "image"],
-        videos=[path for path, media_type in zip(media_paths, media_types) if media_type == "video"],
+        images=image_inputs if image_inputs else None,
+        videos=video_inputs if video_inputs else None,
         return_tensors="pt",
         padding=True,
     ).to("cuda")
@@ -121,7 +131,6 @@ def model_inference(input_dict, history):
 
 # Example inputs
 examples = [
-    [{"text": "Describe the video.", "files": ["examples/demo.mp4"]}],
     [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
     [{"text": "summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "Describe the photo", "files": ["examples/3.png"]}],
@@ -132,6 +141,7 @@ examples = [
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+    [{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}],
 ]
 
 demo = gr.ChatInterface(
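The substance of the change: the media lists are built first and empty ones are normalized to None before the processor call, so an absent modality is skipped entirely instead of being handed over as an empty batch. A minimal sketch of that guard in isolation; the split_media helper and its arguments are illustrative stand-ins, not code from this repo:

```python
# Minimal sketch of the guard this commit adds: empty modality lists are
# normalized to None before reaching the processor. The helper name and
# its arguments are illustrative, not part of the actual app.py.
def split_media(media_paths, media_types):
    """Partition uploaded files by modality; return None for absent modalities."""
    images = [p for p, t in zip(media_paths, media_types) if t == "image"]
    videos = [p for p, t in zip(media_paths, media_types) if t == "video"]
    # An empty list would ask the processor to handle a modality that
    # isn't there; None makes it skip that branch entirely.
    return images or None, videos or None

# Text-only turn: both modalities come back as None
assert split_media([], []) == (None, None)
# Single image: videos stays None
assert split_media(["a.jpg"], ["image"]) == (["a.jpg"], None)
```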
 
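For context on the examples hunks: each entry is one multimodal message (text plus attached files) that gr.ChatInterface offers as a clickable example, and the video example is moved to the end of the list with its file relocated under example_videos/. A minimal runnable sketch of that wiring, with a stub standing in for the real inference function; the multimodal=True flag is an assumption for illustration, not copied from this commit:

```python
# Minimal sketch of how an examples entry in the shape above is consumed
# by a multimodal chat UI. The stub fn and multimodal=True are assumptions;
# the real model_inference runs the Qwen2-VL pipeline.
import gradio as gr

def model_inference(input_dict, history):
    # In multimodal mode, the message arrives as {"text": ..., "files": [...]}.
    files = input_dict.get("files", [])
    return f"Got {input_dict['text']!r} with {len(files)} attachment(s)."

demo = gr.ChatInterface(
    fn=model_inference,
    multimodal=True,  # lets each example carry files alongside its text
    # Path taken from the diff; the file must exist for the example to render.
    examples=[[{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}]],
)

if __name__ == "__main__":
    demo.launch()
```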