# app.py — uploaded by mknolan ("Upload app.py", commit 6bda7a2, verified)
import os
import sys
import torch
import tempfile
from PIL import Image
import gradio as gr
import pdf2image
from transformers import AutoModel, AutoTokenizer
import torchvision.transforms as transforms
# Configuration
# Model identifier passed to both from_pretrained() calls in load_model()
MODEL_NAME = "OpenGVLab/InternVL2_5-8B"
# Side length, in pixels, of the square that preprocess_image() resizes each image to
IMAGE_SIZE = 448
# Model loading function
def load_model():
    """Load the model and tokenizer named by ``MODEL_NAME``.

    Progress is printed to stdout. When CUDA is available the model is loaded
    with ``device_map="auto"``; otherwise no device map is passed.

    Returns:
        A ``(model, tokenizer)`` tuple on success, or ``(None, None)`` when
        loading raised. In that case the error and its traceback are printed
        instead of being propagated, so callers must check for ``None``.
    """
    print(f"\n=== Loading {MODEL_NAME} ===")

    # Query CUDA once and reuse the answer for logging and for the device map
    has_cuda = torch.cuda.is_available()
    print(f"CUDA available: {has_cuda}")

    device = "cuda" if has_cuda else "cpu"
    print(f"Using device: {device}")

    # Load model and tokenizer with minimal options to avoid compatibility issues.
    # trust_remote_code=True allows the custom model code shipped with the
    # checkpoint repository to be executed.
    try:
        model = AutoModel.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            device_map="auto" if has_cuda else None,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            use_fast=False,
            trust_remote_code=True,
        )
    except Exception as e:
        import traceback

        print(f"❌ Error loading model: {e}")
        traceback.print_exc()
        return None, None

    # The check mark was previously emitted as mis-decoded bytes
    print("✓ Model and tokenizer loaded successfully!")
    return model, tokenizer
# Extract slides from uploaded PDF file
def extract_slides_from_pdf(file_obj):
    """Render every page of an uploaded PDF to an image.

    Args:
        file_obj: The upload to convert. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing ``.name`` and, optionally,
            ``.read()``. Both shapes are accepted because the UI layer may
            hand over a path or a file wrapper depending on its version.

    Returns:
        A list of ``(title, image)`` tuples, titled ``"Slide 1"``,
        ``"Slide 2"``, ... in page order. The list is empty when the input is
        missing, does not have a ``.pdf`` extension, or cannot be converted;
        failures are printed rather than raised.
    """
    try:
        is_path = isinstance(file_obj, (str, os.PathLike))
        source_path = os.fspath(file_obj) if is_path else getattr(file_obj, "name", None)
        if not source_path:
            return []

        # Check the extension first so non-PDF uploads are never read into memory
        file_extension = os.path.splitext(source_path)[1].lower()
        if file_extension != '.pdf':
            return []

        # Only file-like inputs need to be copied to disk; paths are used directly
        reader = None if is_path else getattr(file_obj, "read", None)
        temp_path = None
        try:
            if reader is None:
                pdf_path = source_path
            else:
                file_bytes = reader()
                with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
                    # Record the path before writing so a failed write is still cleaned up
                    temp_path = temp_file.name
                    temp_file.write(file_bytes)
                pdf_path = temp_path

            # Extract images from PDF using pdf2image
            try:
                images = pdf2image.convert_from_path(pdf_path, dpi=300)
            except Exception as e:
                print(f"Error converting PDF: {e}")
                return []
            return [(f"Slide {i+1}", img) for i, img in enumerate(images)]
        finally:
            # Remove only the copy created here, never the caller's own file
            if temp_path is not None:
                os.unlink(temp_path)
    except Exception as e:
        import traceback

        error_msg = f"Error extracting slides: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return []
# Simple preprocessing for a single image
def preprocess_image(image):
    """Convert a PIL image into a normalized, batched tensor.

    The image is converted to RGB, resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE``
    (aspect ratio is not preserved), turned into a tensor, normalized with the
    ImageNet channel statistics, and given a leading batch dimension.

    Args:
        image: A PIL image in any mode.

    Returns:
        A tensor of shape ``(1, 3, IMAGE_SIZE, IMAGE_SIZE)``, moved to the GPU
        when CUDA is available.
    """
    # Normalize below uses 3-channel statistics, so RGBA / grayscale / palette
    # inputs must be converted to exactly three channels first.
    rgb_image = image.convert("RGB")

    # Resize image to expected size
    img = rgb_image.resize((IMAGE_SIZE, IMAGE_SIZE))

    # Convert PIL image to tensor and normalize
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Apply transformation and add batch dimension
    img_tensor = transform(img).unsqueeze(0)

    # Move tensor to GPU if available
    if torch.cuda.is_available():
        img_tensor = img_tensor.cuda()
    return img_tensor
# Image analysis function - using simple approach
def analyze_image(model, tokenizer, image, prompt, generation_config=None):
    """Ask the model a question about a single image.

    Args:
        model: Loaded model exposing a ``chat`` method.
        tokenizer: Tokenizer passed through to ``model.chat``.
        image: Image to analyze, or ``None`` if nothing was uploaded.
        prompt: Instruction text; it is sent prefixed with an ``<image>`` tag.
        generation_config: Optional mapping of generation settings. When
            omitted, a deterministic default (``max_new_tokens=1024``,
            ``do_sample=False``) is used.

    Returns:
        The model's response, a fixed hint when ``image`` is ``None``, or an
        ``"Error analyzing image: ..."`` string with the traceback when
        anything raises. This function never propagates exceptions.
    """
    try:
        # Check if image is valid
        if image is None:
            return "Please upload an image first."

        # Process the image with simple preprocessing
        processed_image = preprocess_image(image)

        # Simple prompt format
        question = f"<image>\n{prompt}"

        # InternVL's chat() takes generation_config as a required argument (see
        # the model card); omitting it makes every call fail. A private copy is
        # passed because chat() may write extra keys into the mapping.
        if generation_config is None:
            chat_config = {"max_new_tokens": 1024, "do_sample": False}
        else:
            chat_config = dict(generation_config)

        # Use the model's chat method
        response, _ = model.chat(
            tokenizer=tokenizer,
            pixel_values=processed_image,
            question=question,
            generation_config=chat_config,
            history=None,
            return_history=True,
        )
        return response
    except Exception as e:
        import traceback

        return f"Error analyzing image: {str(e)}\n{traceback.format_exc()}"
# Analyze multiple slides from a PDF
def analyze_pdf_slides(model, tokenizer, file_obj, prompt, num_slides=2):
    """Analyze the first pages of an uploaded PDF and format the results.

    Args:
        model: Loaded model, forwarded to ``analyze_image``.
        tokenizer: Tokenizer, forwarded to ``analyze_image``.
        file_obj: The uploaded PDF, forwarded to ``extract_slides_from_pdf``;
            ``None`` when nothing was uploaded.
        prompt: Instruction applied to every slide.
        num_slides: Maximum number of slides to analyze. Floats are truncated
            to int, negative values are treated as 0, and ``None`` means all.

    Returns:
        A Markdown string with one ``## Slide N`` section per analyzed slide,
        a fixed hint when the upload is missing or yields no slides, or an
        ``"Error analyzing slides: ..."`` string when anything raises.
    """
    try:
        if file_obj is None:
            return "Please upload a PDF file."

        # Extract slides from PDF
        slides = extract_slides_from_pdf(file_obj)
        if not slides:
            return "No slides were extracted from the file. Please check that it's a valid PDF."

        # Limit to the requested number of slides. A slider may deliver the
        # count as a float, which cannot be used as a slice bound; clamping
        # stops a negative count from silently dropping slides from the end.
        if num_slides is not None:
            slides = slides[:max(int(num_slides), 0)]

        # Analyze each slide and format its section
        sections = [
            f"## {slide_title}\n\n"
            f"{analyze_image(model, tokenizer, slide_image, prompt)}\n\n---\n\n"
            for slide_title, slide_image in slides
        ]
        return "".join(sections)
    except Exception as e:
        import traceback

        return f"Error analyzing slides: {str(e)}\n{traceback.format_exc()}"
# Main function
def main():
    """Build the Gradio application object without launching it.

    Loads the model first. If loading failed, a minimal text-only interface
    that reports the failure is returned; otherwise the PDF slide analyzer UI
    is returned.

    Returns:
        The Gradio app (``gr.Interface`` on failure, ``gr.Blocks`` otherwise).
    """
    model, tokenizer = load_model()

    if model is None:
        # No model: serve a stub UI whose only job is to report the failure
        return gr.Interface(
            fn=lambda x: "Model loading failed. Please check the logs for details.",
            inputs=gr.Textbox(),
            outputs=gr.Textbox(),
            title="InternVL2.5 Slide Analyzer - Error",
            description="The model failed to load. Please check the logs for more information."
        )

    # Preset questions offered in the dropdown; free text is allowed as well
    slide_prompts = [
        "Analyze this slide and describe its contents.",
        "What is the main message of this slide?",
        "Extract all the text visible in this slide.",
        "What are the key points presented in this slide?",
        "Describe the visual elements and layout of this slide."
    ]
    example_pdf = "example_slides/test_slides.pdf"

    with gr.Blocks(title="InternVL2.5 PDF Slide Analyzer") as demo:
        gr.Markdown("# InternVL2.5 PDF Slide Analyzer")
        gr.Markdown("Upload a PDF file and analyze multiple slides")

        # NOTE(review): the source's indentation was lost, so which components
        # sit inside this row is inferred — confirm against the deployed layout.
        with gr.Row():
            file_input = gr.File(label="Upload PDF")
            slide_prompt = gr.Dropdown(
                choices=slide_prompts,
                value=slide_prompts[0],
                label="Select a prompt",
                allow_custom_value=True
            )
            num_slides = gr.Slider(
                minimum=1,
                maximum=5,
                value=2,
                step=1,
                label="Number of Slides to Analyze"
            )

        slides_analyze_btn = gr.Button("Analyze Slides")
        slides_output = gr.Markdown(label="Analysis Results")
        analysis_inputs = [file_input, slide_prompt, num_slides]

        # The closure binds the loaded model and tokenizer to the click handler
        slides_analyze_btn.click(
            fn=lambda file, prompt, num: analyze_pdf_slides(model, tokenizer, file, prompt, num),
            inputs=analysis_inputs,
            outputs=slides_output
        )

        # Offer a one-click example only when the sample deck is present
        if os.path.exists(example_pdf):
            gr.Examples(
                examples=[
                    [example_pdf, "Extract all the text visible in this slide.", 2]
                ],
                inputs=[file_input, slide_prompt, num_slides]
            )

    return demo
# Run the application
if __name__ == "__main__":
    import traceback

    # Build the UI and serve it on all network interfaces; any startup failure
    # is reported on stdout together with its traceback instead of propagating.
    try:
        demo = main()
        demo.launch(server_name="0.0.0.0")
    except Exception as exc:
        print(f"Error starting the application: {exc}")
        traceback.print_exc()