prithivMLmods committed
Commit 3331201 · verified · 1 Parent(s): ebca0ae

Update app.py

Files changed (1)
  1. app.py +13 -3
app.py CHANGED
@@ -8,6 +8,7 @@ import torch
 from PIL import Image
 import uuid
 import io
+import os
 
 # Fine-tuned for OCR-based tasks from Qwen's [ Qwen/Qwen2-VL-2B-Instruct ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -93,10 +94,19 @@ def model_inference(input_dict, history):
 
     # Apply chat template and process inputs
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # Prepare inputs for the processor
+    image_inputs = [load_image(path) for path, media_type in zip(media_paths, media_types) if media_type == "image"]
+    video_inputs = [path for path, media_type in zip(media_paths, media_types) if media_type == "video"]
+
+    # Ensure video_inputs is not empty
+    if not video_inputs:
+        video_inputs = None
+
     inputs = processor(
         text=[prompt],
-        images=[load_image(path) for path, media_type in zip(media_paths, media_types) if media_type == "image"],
-        videos=[path for path, media_type in zip(media_paths, media_types) if media_type == "video"],
+        images=image_inputs if image_inputs else None,
+        videos=video_inputs if video_inputs else None,
         return_tensors="pt",
         padding=True,
     ).to("cuda")
@@ -121,7 +131,6 @@ def model_inference(input_dict, history):
 
 # Example inputs
 examples = [
-    [{"text": "Describe the video.", "files": ["examples/demo.mp4"]}],
     [{"text": "Extract JSON from the image", "files": ["example_images/document.jpg"]}],
     [{"text": "summarize the letter", "files": ["examples/1.png"]}],
     [{"text": "Describe the photo", "files": ["examples/3.png"]}],
@@ -132,6 +141,7 @@ examples = [
     [{"text": "Can you describe this image?", "files": ["example_images/newyork.jpg"]}],
     [{"text": "Can you describe this image?", "files": ["example_images/dogs.jpg"]}],
     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
+    [{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}],
 ]
 
 demo = gr.ChatInterface(
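The substance of the change: the media lists are built first and empty ones are normalized to None before the processor call, so an absent modality is skipped entirely instead of being handed over as an empty batch. A minimal sketch of that guard in isolation; the split_media helper and its arguments are illustrative stand-ins, not code from this repo:

```python
# Minimal sketch of the guard this commit adds: empty modality lists are
# normalized to None before reaching the processor. The helper name and
# its arguments are illustrative, not part of the actual app.py.
def split_media(media_paths, media_types):
    """Partition uploaded files by modality; return None for absent modalities."""
    images = [p for p, t in zip(media_paths, media_types) if t == "image"]
    videos = [p for p, t in zip(media_paths, media_types) if t == "video"]
    # An empty list would ask the processor to handle a modality that
    # isn't there; None makes it skip that branch entirely.
    return images or None, videos or None

# Text-only turn: both modalities come back as None
assert split_media([], []) == (None, None)
# Single image: videos stays None
assert split_media(["a.jpg"], ["image"]) == (["a.jpg"], None)
```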
 
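For context on the examples hunks: each entry is one multimodal message (text plus attached files) that gr.ChatInterface offers as a clickable example, and the video example is moved to the end of the list with its file relocated under example_videos/. A minimal runnable sketch of that wiring, with a stub standing in for the real inference function; the multimodal=True flag is an assumption for illustration, not copied from this commit:

```python
# Minimal sketch of how an examples entry in the shape above is consumed
# by a multimodal chat UI. The stub fn and multimodal=True are assumptions;
# the real model_inference runs the Qwen2-VL pipeline.
import gradio as gr

def model_inference(input_dict, history):
    # In multimodal mode, the message arrives as {"text": ..., "files": [...]}.
    files = input_dict.get("files", [])
    return f"Got {input_dict['text']!r} with {len(files)} attachment(s)."

demo = gr.ChatInterface(
    fn=model_inference,
    multimodal=True,  # lets each example carry files alongside its text
    # Path taken from the diff; the file must exist for the example to render.
    examples=[[{"text": "Describe the video.", "files": ["example_videos/sample.mp4"]}]],
)

if __name__ == "__main__":
    demo.launch()
```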