Deadmon committed on
Commit
24fc3ef
·
verified ·
1 Parent(s): a8afb71

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -16
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  from pathlib import Path
3
  import fitz # PyMuPDF for PDF handling
4
  from PIL import Image
5
- from transformers import BlipProcessor, BlipForConditionalGeneration # For image captioning
6
  import torch
7
  import gradio as gr
8
 
@@ -25,8 +25,8 @@ def generate_page_image(pdf_path, page_num):
25
  height = rect.height
26
 
27
  # Calculate appropriate zoom factor to get good quality images
28
- # Aim for approximately 2000 pixels on the longest side
29
- zoom = 2000 / max(width, height)
30
 
31
  # Create a transformation matrix
32
  mat = fitz.Matrix(zoom, zoom)
@@ -67,23 +67,52 @@ def extract_text_from_pdf(pdf_path, page_num):
67
 
68
  def analyze_image(image_path):
69
  """
70
- Analyze image content using BLIP model for image captioning
71
  """
72
  try:
73
- # Load BLIP model and processor
74
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
75
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
76
-
 
 
 
 
77
  # Load and process image
78
  image = Image.open(image_path).convert('RGB')
79
- inputs = processor(image, return_tensors="pt")
80
-
81
- # Generate caption
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  with torch.no_grad():
83
- outputs = model.generate(**inputs)
84
- caption = processor.decode(outputs[0], skip_special_tokens=True)
85
-
86
- return caption
 
 
 
87
  except Exception as e:
88
  print(f"Error during image analysis: {str(e)}")
89
  return "Image content could not be analyzed."
@@ -159,7 +188,7 @@ interface = gr.Interface(
159
  inputs=gr.File(label="Upload PDF"),
160
  outputs=gr.Textbox(label="Analysis Results"),
161
  title="PDF Analyzer",
162
- description="Upload a PDF file to extract text directly and analyze images."
163
  )
164
 
165
  interface.launch()
 
2
  from pathlib import Path
3
  import fitz # PyMuPDF for PDF handling
4
  from PIL import Image
5
+ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor # For Qwen2.5 VL
6
  import torch
7
  import gradio as gr
8
 
 
25
  height = rect.height
26
 
27
  # Calculate appropriate zoom factor to get good quality images
28
+ # Aim for approximately 1000 pixels on the longest side (reduced for efficiency)
29
+ zoom = 1000 / max(width, height)
30
 
31
  # Create a transformation matrix
32
  mat = fitz.Matrix(zoom, zoom)
 
67
 
68
  def analyze_image(image_path):
69
  """
70
+ Analyze image content using Qwen2.5 VL model for detailed description
71
  """
72
  try:
73
+ # Load Qwen2.5 VL model and processor
74
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
75
+ "Qwen/Qwen2-VL-72B-Instruct",
76
+ torch_dtype=torch.float16, # Use float16 for efficiency
77
+ device_map="auto" # Automatically distribute across available GPUs
78
+ )
79
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct")
80
+
81
  # Load and process image
82
  image = Image.open(image_path).convert('RGB')
83
+
84
+ # Prepare input for the model (image + prompt)
85
+ messages = [
86
+ {
87
+ "role": "user",
88
+ "content": [
89
+ {"type": "image", "image": image},
90
+ {"type": "text", "text": "Provide a detailed description of the content in this image, focusing on text, layout, and any diagrams or figures."}
91
+ ]
92
+ }
93
+ ]
94
+
95
+ # Process the input
96
+ text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
97
+ inputs = processor(
98
+ text=text_prompt,
99
+ images=[image],
100
+ padding=True,
101
+ return_tensors="pt"
102
+ )
103
+
104
+ # Move inputs to the appropriate device
105
+ inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
106
+
107
+ # Generate description
108
  with torch.no_grad():
109
+ output_ids = model.generate(**inputs, max_new_tokens=512)
110
+ generated_text = processor.decode(output_ids[0], skip_special_tokens=True)
111
+
112
+ # Extract only the assistant's response (remove the prompt)
113
+ response = generated_text.split("Assistant: ")[1] if "Assistant: " in generated_text else generated_text
114
+
115
+ return response
116
  except Exception as e:
117
  print(f"Error during image analysis: {str(e)}")
118
  return "Image content could not be analyzed."
 
188
  inputs=gr.File(label="Upload PDF"),
189
  outputs=gr.Textbox(label="Analysis Results"),
190
  title="PDF Analyzer",
191
+ description="Upload a PDF file to extract text directly and analyze images using Qwen2.5 VL."
192
  )
193
 
194
  interface.launch()