Spaces:

mknolan
/

internvl25-slide-analyzer-simple

Paused

App Files Community

internvl25-slide-analyzer-simple / app.py

mknolan

Upload app.py

6bda7a2 verified about 2 months ago

raw

history blame contribute delete

7.68 kB

	import os
	import sys
	import torch
	import tempfile
	from PIL import Image
	import gradio as gr
	import pdf2image
	from transformers import AutoModel, AutoTokenizer
	import torchvision.transforms as transforms

	# Configuration
	# Hugging Face Hub model id; load_model() passes it to both
	# AutoModel.from_pretrained and AutoTokenizer.from_pretrained.
	MODEL_NAME = "OpenGVLab/InternVL2_5-8B"
	# Side length, in pixels, of the square that preprocess_image() resizes
	# every input image to.
	IMAGE_SIZE = 448

	# Model loading function
	def load_model():
	    """Load the model and tokenizer named by ``MODEL_NAME``.

	    Progress and the chosen device are printed to stdout. Loading never
	    raises: any exception is printed together with its traceback.

	    Returns:
	        ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
	    """
	    print(f"\n=== Loading {MODEL_NAME} ===")
	    has_cuda = torch.cuda.is_available()
	    print(f"CUDA available: {has_cuda}")

	    # The device string is only reported; actual placement is left to
	    # ``device_map`` below.
	    device = "cuda" if has_cuda else "cpu"
	    print(f"Using device: {device}")

	    # Keep the option set minimal to avoid compatibility issues.
	    model_options = {
	        "trust_remote_code": True,
	        "device_map": "auto" if has_cuda else None,
	    }
	    tokenizer_options = {
	        "use_fast": False,
	        "trust_remote_code": True,
	    }

	    try:
	        model = AutoModel.from_pretrained(MODEL_NAME, **model_options)
	        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **tokenizer_options)
	        print("✓ Model and tokenizer loaded successfully!")
	        return model, tokenizer
	    except Exception as load_error:
	        import traceback

	        print(f"❌ Error loading model: {load_error}")
	        traceback.print_exc()
	        return None, None

	# Extract slides from uploaded PDF file
	def extract_slides_from_pdf(file_obj):
	    """Render every page of an uploaded PDF to an image.

	    Args:
	        file_obj: The upload. Either a filesystem path (``str`` or
	            ``os.PathLike``) or a file-like object exposing ``.name`` and
	            ``.read()``. Both are accepted because the value handed over by
	            the upload widget differs between Gradio versions.

	    Returns:
	        A list of ``(title, image)`` tuples in page order, titled
	        ``"Slide 1"``, ``"Slide 2"``, ... The list is empty when the upload
	        does not have a ``.pdf`` extension or when any step fails; failures
	        are printed, never raised.
	    """
	    temp_path = None
	    try:
	        is_path = isinstance(file_obj, (str, os.PathLike))
	        source_name = os.fspath(file_obj) if is_path else getattr(file_obj, "name", "")
	        file_extension = os.path.splitext(source_name)[1].lower()

	        # Reject non-PDF uploads before reading any bytes.
	        if file_extension != '.pdf':
	            return []

	        if is_path:
	            # Already on disk: convert in place, no copy needed.
	            pdf_path = source_name
	        else:
	            # File-like upload: pdf2image needs a path, so spill the bytes
	            # to a temporary file that is removed in ``finally``.
	            with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as temp_file:
	                temp_path = temp_file.name
	                temp_file.write(file_obj.read())
	            pdf_path = temp_path

	        try:
	            images = pdf2image.convert_from_path(pdf_path, dpi=300)
	        except Exception as e:
	            print(f"Error converting PDF: {e}")
	            return []

	        return [(f"Slide {number}", img) for number, img in enumerate(images, start=1)]

	    except Exception as e:
	        import traceback
	        error_msg = f"Error extracting slides: {str(e)}\n{traceback.format_exc()}"
	        print(error_msg)
	        return []

	    finally:
	        # Clean up on every exit path. A failed delete must not discard
	        # slides that were already rendered.
	        if temp_path is not None:
	            try:
	                os.unlink(temp_path)
	            except OSError as cleanup_error:
	                print(f"Could not remove temporary file {temp_path}: {cleanup_error}")

	# Simple preprocessing for a single image
	def preprocess_image(image):
	    """Convert a PIL image into a normalized, batched tensor.

	    The image is converted to RGB, resized to ``IMAGE_SIZE`` x ``IMAGE_SIZE``
	    (aspect ratio is not preserved), turned into a tensor with ``ToTensor``
	    and normalized per channel with the fixed mean/std below.

	    Args:
	        image: A PIL image in any mode.

	    Returns:
	        A tensor with a leading batch dimension of size 1, moved to the
	        default CUDA device when CUDA is available.
	    """
	    # Normalize uses three-channel statistics, so grayscale, palette or RGBA
	    # input would fail with a channel mismatch. Converting before the resize
	    # also means palette images are not resized in index space.
	    rgb_image = image.convert("RGB")

	    # Resize image to expected size
	    resized = rgb_image.resize((IMAGE_SIZE, IMAGE_SIZE))

	    # Convert PIL image to tensor and normalize
	    transform = transforms.Compose([
	        transforms.ToTensor(),
	        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
	    ])

	    # Apply transformation and add batch dimension
	    img_tensor = transform(resized).unsqueeze(0)

	    # Move tensor to GPU if available
	    if torch.cuda.is_available():
	        img_tensor = img_tensor.cuda()

	    return img_tensor

	# Image analysis function - using simple approach
	def analyze_image(model, tokenizer, image, prompt, generation_config=None):
	    """Ask the model a question about a single image.

	    Args:
	        model: Loaded model exposing a ``chat`` method.
	        tokenizer: Tokenizer that belongs to ``model``.
	        image: PIL image to analyze, or ``None``.
	        prompt: Instruction text; it is prefixed with an ``<image>``
	            placeholder line before being sent to the model.
	        generation_config: Optional mapping of generation options. Defaults
	            to greedy decoding with at most 1024 new tokens.

	    Returns:
	        The model's response, a fixed hint when ``image`` is ``None``, or an
	        error message including the traceback if anything fails. This
	        function never raises.
	    """
	    try:
	        # Check if image is valid
	        if image is None:
	            return "Please upload an image first."

	        # Process the image with simple preprocessing
	        pixel_values = preprocess_image(image)

	        # Match the input dtype to the model's parameters so a model loaded
	        # in half precision does not fail on float32 input.
	        model_dtype = getattr(model, "dtype", None)
	        if model_dtype is not None:
	            pixel_values = pixel_values.to(model_dtype)

	        # Simple prompt format
	        question = f"<image>\n{prompt}"

	        # The model's chat method requires a generation config. Pass a copy
	        # so the caller's mapping is never modified by the model code.
	        if generation_config is None:
	            chat_config = {"max_new_tokens": 1024, "do_sample": False}
	        else:
	            chat_config = dict(generation_config)

	        # Use the model's chat method
	        response, _ = model.chat(
	            tokenizer=tokenizer,
	            pixel_values=pixel_values,
	            question=question,
	            generation_config=chat_config,
	            history=None,
	            return_history=True,
	        )

	        return response
	    except Exception as e:
	        import traceback
	        error_msg = f"Error analyzing image: {str(e)}\n{traceback.format_exc()}"
	        return error_msg

	# Analyze multiple slides from a PDF
	def analyze_pdf_slides(model, tokenizer, file_obj, prompt, num_slides=2):
	    """Analyze the first pages of an uploaded PDF and return Markdown.

	    Args:
	        model: Loaded model, forwarded to ``analyze_image``.
	        tokenizer: Tokenizer, forwarded to ``analyze_image``.
	        file_obj: The uploaded file, or ``None`` if nothing was uploaded.
	        prompt: Instruction applied to every slide.
	        num_slides: Maximum number of slides to analyze, counted from the
	            first page. Floats are truncated to an integer; ``None`` means
	            no limit.

	    Returns:
	        One Markdown section per analyzed slide (a ``##`` heading, the
	        analysis text and a ``---`` rule), a hint message when no file or no
	        slides are available, or an error message including the traceback.
	        This function never raises.
	    """
	    try:
	        if file_obj is None:
	            return "Please upload a PDF file."

	        # Extract slides from PDF
	        slides = extract_slides_from_pdf(file_obj)

	        if not slides:
	            return "No slides were extracted from the file. Please check that it's a valid PDF."

	        # Slider values may arrive as floats (e.g. 2.0), which are not valid
	        # slice bounds, so coerce before slicing.
	        limit = None if num_slides is None else int(num_slides)

	        # Analyze each selected slide and format its Markdown section.
	        sections = [
	            f"## {slide_title}\n\n{analyze_image(model, tokenizer, slide_image, prompt)}\n\n---\n\n"
	            for slide_title, slide_image in slides[:limit]
	        ]

	        return "".join(sections)

	    except Exception as e:
	        import traceback
	        error_msg = f"Error analyzing slides: {str(e)}\n{traceback.format_exc()}"
	        return error_msg

	# Main function
	def main():
	"""Build the Gradio UI for the slide analyzer and return it without launching.

	The model is loaded first. If loading fails, a minimal gr.Interface that
	only reports the failure is returned; otherwise a gr.Blocks app is returned
	that wires a PDF upload, a prompt dropdown and a slide-count slider to
	analyze_pdf_slides().

	NOTE(review): indentation was lost in this copy of the file, so which
	components sit inside the gr.Row() below cannot be determined from here.
	Confirm against the original layout before re-indenting.
	"""
	# Load the model
	model, tokenizer = load_model()

	if model is None:
	# Create an error interface if model loading failed
	demo = gr.Interface(
	fn=lambda x: "Model loading failed. Please check the logs for details.",
	inputs=gr.Textbox(),
	outputs=gr.Textbox(),
	title="InternVL2.5 Slide Analyzer - Error",
	description="The model failed to load. Please check the logs for more information."
	)
	return demo

	# Create a simple interface
	with gr.Blocks(title="InternVL2.5 PDF Slide Analyzer") as demo:
	gr.Markdown("# InternVL2.5 PDF Slide Analyzer")
	gr.Markdown("Upload a PDF file and analyze multiple slides")

	# Preset prompts offered in the dropdown; the first one is the default
	slide_prompts = [
	"Analyze this slide and describe its contents.",
	"What is the main message of this slide?",
	"Extract all the text visible in this slide.",
	"What are the key points presented in this slide?",
	"Describe the visual elements and layout of this slide."
	]

	with gr.Row():
	file_input = gr.File(label="Upload PDF")
	# allow_custom_value lets the user type a prompt that is not in the list
	slide_prompt = gr.Dropdown(
	choices=slide_prompts,
	value=slide_prompts[0],
	label="Select a prompt",
	allow_custom_value=True
	)

	# How many slides, counted from the first page, to analyze (1 to 5, default 2)
	num_slides = gr.Slider(
	minimum=1,
	maximum=5,
	value=2,
	step=1,
	label="Number of Slides to Analyze"
	)

	slides_analyze_btn = gr.Button("Analyze Slides")
	slides_output = gr.Markdown(label="Analysis Results")

	# Handle the slides analysis action; the lambda closes over the model and
	# tokenizer loaded above so the handler only receives the UI values
	slides_analyze_btn.click(
	fn=lambda file, prompt, num: analyze_pdf_slides(model, tokenizer, file, prompt, num),
	inputs=[file_input, slide_prompt, num_slides],
	outputs=slides_output
	)

	# Add example if available (only when the sample PDF exists relative to the
	# current working directory)
	if os.path.exists("example_slides/test_slides.pdf"):
	gr.Examples(
	examples=[
	["example_slides/test_slides.pdf", "Extract all the text visible in this slide.", 2]
	],
	inputs=[file_input, slide_prompt, num_slides]
	)

	return demo

	# Run the application
	if __name__ == "__main__":
	    # Script entry point: build the UI and serve it on all network
	    # interfaces. Start-up failures are reported on stdout with a traceback
	    # instead of propagating.
	    try:
	        demo = main()
	        demo.launch(server_name="0.0.0.0")
	    except Exception as startup_error:
	        import traceback

	        print(f"Error starting the application: {startup_error}")
	        traceback.print_exc()