import os
from pathlib import Path

import fitz  # PyMuPDF for PDF handling
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor  # for Qwen2-VL

# Create output directory for rendered page images and analysis results
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)
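# Assumed dependencies (PyPI names): pymupdf, pillow, torch, transformers,
# gradio, plus accelerate for device_map="auto" when loading the model below.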
def generate_page_image(pdf_path, page_num):
    """
    Generate an image from a specific PDF page for analysis
    """
    try:
        # Open the PDF
        pdf_document = fitz.open(pdf_path)
        page = pdf_document[page_num]

        # Get the page dimensions to determine an appropriate resolution
        rect = page.rect
        width = rect.width
        height = rect.height

        # Calculate a zoom factor that gives good quality images:
        # aim for approximately 1000 pixels on the longest side (reduced for efficiency)
        zoom = 1000 / max(width, height)

        # Create a transformation matrix and render the page to a pixmap
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Convert to a PIL Image and save it
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        image_path = OUTPUT_DIR / f"page_{page_num + 1}.png"
        img.save(image_path, "PNG")

        pdf_document.close()
        return image_path
    except Exception as e:
        print(f"Error generating image for page {page_num + 1}: {str(e)}")
        return None
def extract_text_from_pdf(pdf_path, page_num):
    """
    Extract text directly from a specific PDF page
    """
    try:
        # Open the PDF and read the page's text layer
        pdf_document = fitz.open(pdf_path)
        page = pdf_document[page_num]
        text = page.get_text("text")
        pdf_document.close()
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from page {page_num + 1}: {str(e)}")
        return ""
def analyze_image(image_path):
    """
    Analyze image content using the Qwen2-VL model for a detailed description
    """
    try:
        # Load the Qwen2-VL model and processor.
        # NOTE: the 72B checkpoint is reloaded on every call; for multi-page PDFs
        # consider the cached load_qwen() helper sketched after this function.
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-72B-Instruct",
            torch_dtype=torch.float16,  # use float16 for efficiency
            device_map="auto"           # automatically distribute across available GPUs
        )
        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct")

        # Load and process the image
        image = Image.open(image_path).convert('RGB')

        # Prepare input for the model (image + prompt)
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Provide a detailed description of the content in this image, focusing on text, layout, and any diagrams or figures."}
                ]
            }
        ]

        # Build the chat prompt and tokenize it together with the image
        text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(
            text=[text_prompt],
            images=[image],
            padding=True,
            return_tensors="pt"
        )

        # Move inputs to the appropriate device
        inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")

        # Generate the description
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=512)

        # Keep only the newly generated tokens (strip the echoed prompt)
        trimmed_ids = [out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)]
        response = processor.batch_decode(trimmed_ids, skip_special_tokens=True)[0].strip()
        return response
    except Exception as e:
        print(f"Error during image analysis: {str(e)}")
        return "Image content could not be analyzed."
def process_pdf(pdf_path, output_txt_path):
    """
    Main function to process the PDF and generate the output report
    """
    try:
        # Open the PDF to get the page count
        pdf_document = fitz.open(pdf_path)
        num_pages = len(pdf_document)
        pdf_document.close()

        if num_pages == 0:
            print("The PDF is empty.")
            return

        # Prepare the output file
        with open(output_txt_path, 'w', encoding='utf-8') as f:
            f.write(f"Analysis of {os.path.basename(pdf_path)}\n")
            f.write("=" * 50 + "\n\n")

            # Process each page
            for page_num in range(num_pages):
                print(f"Processing page {page_num + 1}...")

                # Write the page header
                f.write(f"Page {page_num + 1}\n")
                f.write("-" * 30 + "\n\n")

                # Extract and write the text layer
                text = extract_text_from_pdf(pdf_path, page_num)
                if text:
                    f.write("Extracted Text:\n")
                    f.write(text)
                    f.write("\n\n")
                else:
                    f.write("No text could be extracted from this page.\n\n")

                # Render the page to an image and write the model's description
                image_path = generate_page_image(pdf_path, page_num)
                if image_path:
                    description = analyze_image(image_path)
                    f.write("Image Description:\n")
                    f.write(f"{description}\n")
                    f.write("\n" + "=" * 50 + "\n\n")
                else:
                    f.write("Image Description:\n")
                    f.write("Could not generate image for analysis.\n")
                    f.write("\n" + "=" * 50 + "\n\n")

        print(f"Processing complete. Results saved to {output_txt_path}")
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
def process_uploaded_pdf(pdf_file):
    if pdf_file is None:
        return "Please upload a PDF file."

    # Depending on the Gradio version, gr.File yields either a file path string
    # or a file-like object with a .name attribute
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    output_txt = OUTPUT_DIR / "analysis_results.txt"
    process_pdf(pdf_path, output_txt)

    # Read and return the results
    with open(output_txt, 'r', encoding='utf-8') as f:
        results = f.read()
    return results
# Create the Gradio interface
interface = gr.Interface(
    fn=process_uploaded_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Textbox(label="Analysis Results"),
    title="PDF Analyzer",
    description="Upload a PDF file to extract text directly and analyze each page image using Qwen2-VL."
)

if __name__ == "__main__":
    interface.launch()
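# Typical usage (script name is illustrative):
#   python pdf_analyzer.py
# Gradio serves the app locally (http://127.0.0.1:7860 by default); upload a
# PDF in the browser and the combined report is shown in the textbox and also
# written to outputs/analysis_results.txt.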