Spaces:

Deadmon
/

ocr-pdf

Running

App Files Files Community

Deadmon committed on Mar 9

Commit

24fc3ef

verified ·

1 Parent(s): a8afb71

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -16

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 import fitz  # PyMuPDF for PDF handling
 from PIL import Image
-from transformers import BlipProcessor, BlipForConditionalGeneration  # For image captioning
 import torch
 import gradio as gr
@@ -25,8 +25,8 @@ def generate_page_image(pdf_path, page_num):
         height = rect.height
         # Calculate appropriate zoom factor to get good quality images
-        # Aim for approximately 2000 pixels on the longest side
-        zoom = 2000 / max(width, height)
         # Create a transformation matrix
         mat = fitz.Matrix(zoom, zoom)
@@ -67,23 +67,52 @@ def extract_text_from_pdf(pdf_path, page_num):
 def analyze_image(image_path):
     """
-    Analyze image content using BLIP model for image captioning
     """
     try:
-        # Load BLIP model and processor
-        processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-        model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
         # Load and process image
         image = Image.open(image_path).convert('RGB')
-        inputs = processor(image, return_tensors="pt")
-        # Generate caption
         with torch.no_grad():
-            outputs = model.generate(**inputs)
-        caption = processor.decode(outputs[0], skip_special_tokens=True)
-        return caption
     except Exception as e:
         print(f"Error during image analysis: {str(e)}")
         return "Image content could not be analyzed."
@@ -159,7 +188,7 @@ interface = gr.Interface(
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.Textbox(label="Analysis Results"),
     title="PDF Analyzer",
-    description="Upload a PDF file to extract text directly and analyze images."
 )
 interface.launch()

 from pathlib import Path
 import fitz  # PyMuPDF for PDF handling
 from PIL import Image
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor  # For Qwen2.5 VL
 import torch
 import gradio as gr
         height = rect.height
         # Calculate appropriate zoom factor to get good quality images
+        # Aim for approximately 1000 pixels on the longest side (reduced for efficiency)
+        zoom = 1000 / max(width, height)
         # Create a transformation matrix
         mat = fitz.Matrix(zoom, zoom)
 # Hugging Face model ID used for image description.
 # NOTE(review): surrounding comments/UI text call this "Qwen2.5 VL", but the
 # checkpoint actually loaded is Qwen2-VL — confirm which one is intended.
 _QWEN_VL_MODEL_ID = "Qwen/Qwen2-VL-72B-Instruct"

 # Filled on first use so the (very large) weights are loaded once per process
 # instead of once per analyzed image.
 _QWEN_VL_CACHE = {}


 def _load_qwen_vl():
     """Return the cached (model, processor) pair, loading both on first use."""
     if "model" not in _QWEN_VL_CACHE:
         _QWEN_VL_CACHE["processor"] = AutoProcessor.from_pretrained(_QWEN_VL_MODEL_ID)
         _QWEN_VL_CACHE["model"] = Qwen2VLForConditionalGeneration.from_pretrained(
             _QWEN_VL_MODEL_ID,
             torch_dtype=torch.float16,  # Use float16 for efficiency
             device_map="auto",  # Automatically distribute across available devices
         )
     return _QWEN_VL_CACHE["model"], _QWEN_VL_CACHE["processor"]


 def analyze_image(image_path):
     """
     Describe the content of an image using the Qwen2-VL instruct model.

     Args:
         image_path: Path to an image file that PIL can open.

     Returns:
         The model's textual description of the image, or the fixed string
         "Image content could not be analyzed." if any step raises.
     """
     try:
         # Open the image before touching the model so an unreadable path
         # fails fast; the context manager closes the underlying file handle.
         with Image.open(image_path) as source_image:
             image = source_image.convert('RGB')

         model, processor = _load_qwen_vl()

         # Prepare input for the model (image + prompt)
         messages = [
             {
                 "role": "user",
                 "content": [
                     {"type": "image", "image": image},
                     {"type": "text", "text": "Provide a detailed description of the content in this image, focusing on text, layout, and any diagrams or figures."},
                 ],
             }
         ]
         text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
         inputs = processor(
             text=text_prompt,
             images=[image],
             padding=True,
             return_tensors="pt",
         )
         # The model is placed by device_map="auto", so follow the model's own
         # device instead of guessing between "cuda" and "cpu".
         inputs = inputs.to(model.device)

         # Generate description
         with torch.no_grad():
             output_ids = model.generate(**inputs, max_new_tokens=512)

         # generate() returns prompt tokens followed by the completion; drop
         # the prompt by length rather than searching the decoded text for a
         # role marker that the chat template may not emit.
         prompt_length = inputs["input_ids"].shape[1]
         response = processor.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
         return response.strip()
     except Exception as e:
         # Deliberate best-effort: a failed image analysis must not abort the
         # rest of the PDF processing.
         print(f"Error during image analysis: {str(e)}")
         return "Image content could not be analyzed."
     inputs=gr.File(label="Upload PDF"),
     outputs=gr.Textbox(label="Analysis Results"),
     title="PDF Analyzer",
+    description="Upload a PDF file to extract text directly and analyze images using Qwen2.5 VL."
 )
 interface.launch()